Hien Lê, Zafer Kocaoglu, Francesco Maizza, Anita Mezzetti, Nataliia Surianinova
Project structure:
import itertools
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import collections
# sklearn
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, MultiLabelBinarizer, StandardScaler, MinMaxScaler
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score
from sklearn.metrics import confusion_matrix, roc_curve, auc, roc_auc_score
from sklearn.metrics import make_scorer
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.utils import shuffle
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn.linear_model import ElasticNet
from sklearn.linear_model import LinearRegression
from sklearn import linear_model
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from xgboost import XGBRegressor
from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE
from collections import OrderedDict, Counter
from matplotlib import pyplot
from random import randint
warnings.filterwarnings("ignore")
pd.options.mode.chained_assignment = None # default='warn'
class color:
    """ANSI escape codes for emphasising text printed to the terminal."""
    PURPLE = '\033[95m'
    CYAN = '\033[96m'
    DARKCYAN = '\033[36m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'
    END = '\033[0m'
import tensorflow as tf
import tensorflow.keras as tfk
import tensorflow.keras.layers as tfkl
import tensorflow_probability as tfp
# import data
# Load the raw IMDb movie metadata (one row per film) and show the column dtypes.
movie_data = pd.read_csv(r'data/movie_metadata.csv')
print("The features of the movie dataset are:")
print(movie_data.dtypes)
We check what columns are actually useful. In particular we analyse four of them.
We check that the color column is binary: a film is 'black and white' or is 'color'.
We drop the color column because we want to help a modern film company, which will almost surely produce color films. Nowadays, the decision to produce a 'black and white' movie is an unusual choice, driven by artistic needs.
In the next section (year) we will keep only movies produced after 1980 (we will explain why). We can ignore the color column because most recent movies are 'Color'.
Firstly we verify that 'black and white' or is 'color' are the only values of the column:
# Collect the distinct, non-null values of the 'color' column.
unique_colors = movie_data.color.unique()
color_values = [c for c in unique_colors if str(c) != 'nan']
print(f"The color column has only {len(color_values)} different values: '{color_values[0]}' and '{color_values[1]}'.")
# Per-year counts of color vs black-and-white movies, then a bar plot.
year_color = movie_data[['color', 'title_year']]  # we only need color and year
year_color = year_color[~year_color.color.isnull()]        # drop rows with NaN color
year_color = year_color[~year_color.title_year.isnull()]   # drop rows with NaN year
year_color = year_color.sort_values('title_year')
years = year_color.title_year.drop_duplicates().tolist()

# DataFrame.append was removed in pandas 2.0: collect one dict per year and
# build the frame in a single call instead of growing it row by row.
rows = []
for y in years:
    year_movies = year_color.loc[year_color['title_year'] == y].color.values
    rows.append({
        'year': int(y),
        'num_color': (year_movies == color_values[0]).sum(),
        'num_black_white': (year_movies == color_values[1]).sum(),
    })
year_number_colors = pd.DataFrame(rows)

print("For each year, we check how many films are colored and how many black and white:")
year_number_colors = year_number_colors.set_index('year')
year_number_colors.head()

year_number_colors['year'] = year_number_colors.index
f = year_number_colors.plot(x="year", y=["num_color", "num_black_white"], kind="bar")
# 'nonposy' was removed in matplotlib 3.3; 'nonpositive' is the replacement.
plt.yscale('log', nonpositive='clip')
plt.ylabel('# movies')
plt.title("Movies colors")
for ind, label in enumerate(f.get_xticklabels()):
    label.set_visible(ind % 10 == 0)  # keep every 10th tick label
We can see that most movies in recent years are 'color', so this column is not a relevant feature. We can drop it.
movie_data = movie_data.drop('color', axis=1) # drop color column
Customers' tastes change quickly. We have already decided to drop black-and-white films, because they are not relevant for the analysis. In the same way, we can ignore films produced before 1980. As the histogram shows, this choice does not drop the majority of the movies, most of which were produced after 1980.
# Distribution of production years over the full dataset.
movie_data.title_year.hist(histtype='bar', width=8)
plt.title("Number of movies")
plt.ylabel("# movies")
plt.xlabel("Year")
plt.show()

# Keep only films from 1980 onwards and re-plot the distribution.
movie_data = movie_data[movie_data.title_year >= 1980]
movie_data.title_year.hist(width=3)
plt.title("Number of movies from 1980")
plt.ylabel("# movies")
plt.xlabel("Year")
plt.show()
We want to analyse whether the aspect ratio is an interesting feature. Nowadays, there are rules that constrain the ratios film-makers must use (deeper research in the report). For example, Wikipedia says (https://en.wikipedia.org/wiki/Aspect_ratio_(image)):
The most common aspect ratios used today in the presentation of films in cinemas are 1.85:1 and 2.39:1. Two common videographic aspect ratios are 4:3 (1.3:1), the universal video format of the 20th century, and 16:9 (1.7:1), universal for high-definition television and European digital television. Other cinema and video aspect ratios exist, but are used infrequently.
We investigate for each year what aspect ratios were used.
# Sorted list of the distinct, non-null aspect ratios in the dataset.
ar = sorted(x for x in movie_data.aspect_ratio.unique() if str(x) != 'nan')
print("The aspect ratios which were used were:")
print(*ar, sep="\n")
# Per-year counts of movies for each aspect ratio.
year_ar = movie_data[['title_year', 'aspect_ratio']]  # we only need year and ratio
year_ar = year_ar[~year_ar.aspect_ratio.isnull()]      # drop rows with NaN ratio
year_ar = year_ar[~year_ar.title_year.isnull()]        # drop rows with NaN year
year_ar = year_ar.sort_values('title_year')
years = year_ar.title_year.drop_duplicates().tolist()

# DataFrame.append was removed in pandas 2.0: accumulate one dict per year
# (keys in `ar` order, 'year' last, preserving the original column layout)
# and build the frame in one call.
rows = []
for y in years:
    year_movies = year_ar.loc[year_ar['title_year'] == y].aspect_ratio.values
    row = {ratio: (year_movies == ratio).sum() for ratio in ar}
    row['year'] = int(y)
    rows.append(row)
year_number_ar = pd.DataFrame(rows)

print("For each year, we check the number of movies for each ratio:")
year_number_ar = year_number_ar.set_index('year')
year_number_ar.head()
From this DataFrame we can see that 1.85 and 2.35 have far more movies than all the other ratios. We can plot these two together with the sum of all the others and see that the others are irrelevant.
# Compare the two dominant ratios against the sum of all the others.
year_number_ar['year'] = year_number_ar.index
ratio_cols = list(year_number_ar.columns[:20])  # the 20 aspect-ratio count columns
ar_185 = ratio_cols[9]    # the 1.85 ratio column
ar_235 = ratio_cols[14]   # the 2.35 ratio column
# Sum every ratio column except the two dominant ones (replaces the
# hard-coded index list, which silently skipped positions 9 and 14).
other_cols = [c for c in ratio_cols if c not in (ar_185, ar_235)]
year_number_ar['sum'] = year_number_ar[other_cols].sum(axis=1)

year_number_ar.plot(kind='line', x='year', y=[ar_185, ar_235, 'sum'])
plt.xlabel('year')  # fixed: label was mistakenly 'Length of URL' (copy-paste bug)
plt.ylabel('# movies')
plt.legend(['1.85', '2.35', 'sum of all other ratios'])
plt.title("Number of movies for each year for different aspect ratios")
plt.show()
This plot shows that in recent years most movies use only two aspect ratios. So it is not an interesting feature: there is probably some obligation, and our film company should not use this characteristic to decide its next investment.
In particular, we see a drop in the orange line after 2012. This does not mean that customers prefer another aspect ratio; it is due to a general drop in film production. In fact, the other lines go down as well. To show this we can use percentages:
# Express the three series as shares of the yearly total to remove the
# effect of the overall drop in film production.
ar_185_perc = year_number_ar.iloc[:, 9].values
ar_235_perc = year_number_ar.iloc[:, 14].values
ar_sum_perc = year_number_ar.iloc[:, 21].values
at_tot = ar_185_perc + ar_235_perc + ar_sum_perc
years = year_number_ar.index.values
for series in (ar_185_perc, ar_235_perc, ar_sum_perc):
    plt.plot(years, series / at_tot)
plt.xlabel('year')
plt.ylabel('% movies')
plt.legend(['1.85', '2.35', 'sum of all other ratios'])
plt.title("Number of movies for each year for different aspect ratios in %")
plt.show()
Using percentages we can notice that the 2.35 ratio remains the prevalent one. The sum of all other ratios (different from 2.35 and 1.85) remains quite low. Given the recent rules for movie companies regarding the ratio (which must be optimal for viewing at the cinema), we can drop this column.
movie_data = movie_data.drop('aspect_ratio', axis=1) # drop aspect ratio column
Firstly, we plot for each country, the number of movies produced there
# Count movies per production country and plot the 12 most common.
# (Removed an unused MultiLabelBinarizer instance; Counter.most_common
# replaces the manual sort + islice of the count dictionary.)
country = movie_data.country.values
count_country = Counter(country)
top_countries = count_country.most_common(12)
dd = pd.DataFrame(top_countries).set_index(0)
fig, ax = plt.subplots(figsize=(6, 6))
dd.plot(kind='barh', legend=False, ax=ax)
ax.set_xlabel('Number of movies')
ax.set_ylabel('Country')
plt.show()
We have taken our data from different sources and we do not know, for movies which are not American, in which currencies the budget and the gross are expressed. Theoretically, according to the documentation of our databases, the prices should all be in dollars. However, some movies (such as the Japanese film 'The Host') are in yen.
Hence, considering that the large majority of movies are American and that we suppose to do our analysis for an American company, we can drop all movies which are not produced in USA. In this way we avoid potential and dangerous currency-related mistakes and we can focus on American people tastes.
From the movie_imdb_link we extract the IMDB_id.
# Extract the IMDb id: it is the 5th path segment of the link
# (e.g. http://www.imdb.com/title/tt0499549/...).
movie_data['imdb_id'] = movie_data['movie_imdb_link'].str.split('/').map(lambda parts: parts[4])
movie_data = movie_data.drop(columns='movie_imdb_link')  # the raw link is no longer needed
We noticed that some movies are repeated, so we cannot just set imdb_id as the index: it is not a unique key yet. Before that, we have to delete duplicate rows.
This is also the first step in the deletion of NaN values. In fact, many NaN values arise because a film that is present twice does not have all the values on both lines.
# NaN counts per column before deduplication (fixed typo: "doplicate").
print("Number of NaN values for each feature before dropping duplicate rows:")
movie_data.isna().sum()
From the DataFrame we delete duplicate movies. In order to do that, we sort the rows by their number of NaN values and keep only the copy with the fewest missing features. This not only deletes duplicates, but also reduces the number of NaNs for each feature.
# Deduplicate by imdb_id, keeping the most complete copy of each movie:
# argsort over the negated per-row NaN count orders rows most-NaN first,
# so keep='last' retains the duplicate with the fewest missing values.
movie_data = (
    movie_data.iloc[movie_data.isnull().sum(axis=1).mul(-1).argsort()] # Sort by number of NaN
    .drop_duplicates(subset='imdb_id', keep='last') # Remove duplicates taking last element
)
# NaN counts per column after deduplication (fixed typo: "doplicate").
print("Number of NaN values for each feature after dropping duplicate rows:")
movie_data.isna().sum()
A lot of features have fewer missing values! This makes our DataFrame more consistent. Moreover, we now have a unique key: imdb_id. Now we can set it as the index.
It is useful to match this database with the other ones, that we need to fill the NaN values. The reason why we do not use the title to match them is that the title is not an unique key: it can be inconsistent between different data sets (eg. Capitalization).
We check that this 'imdb_id' is unique and we set it as index:
# Only promote imdb_id to the index if it is actually a unique key.
if movie_data.imdb_id.nunique() == movie_data.imdb_id.size:
    movie_data = movie_data.set_index('imdb_id')
movie_data.head()
We use these two datasets to fill NaN values
movie_data.isna().sum()
This is the Wikidata query that we used to download the data:
SELECT ?movieLabel (MAX(?grosses) as ?gross) (MIN(?costs) as ?cost) (MIN(?publicationYears) as ?publicationYear) ?IMDBid
WHERE
{
?movie (wdt:P31/wdt:P279*) wd:Q11424;
wdt:P345 ?IMDBid;
wdt:P2142 ?grosses;
wdt:P577 ?publicationDate.
OPTIONAL {?movie wdt:P2130 ?costs.}
BIND(YEAR(?publicationDate) AS ?publicationYears)
FILTER(?publicationYears >= (1980))
SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
} GROUP BY ?movieLabel ?IMDBid
# Wikidata export: keyed by IMDb id; the human-readable label is dropped and
# the gross column is renamed so it does not clash with the IMDb one.
wiki_data = (
    pd.read_csv('data/wiki_data.csv')
    .drop('movieLabel', axis=1)
    .set_index("IMDBid")
    .rename(columns={"gross": "gross_1"})
    .sort_values(axis=0, by="IMDBid")
)
wiki_data.head()
# Left-join on the shared IMDb-id index.
movie_df = movie_data.merge(wiki_data, how='left', left_index=True, right_index=True)
movie_df.head()
# three columns were added (gross_1, cost, publicationYear):
movie_data.shape, movie_df.shape
Now we have some repeated columns: gross and costs.
We have to keep only one of them. Instead of deleting one without an analysis. we match them in order to delete as many NaN values as possible
# gross   : gross from the movie dataset
# gross_1 : gross from the Wikidata dataset
# Keep the original gross; where it is NaN, take the Wikidata value.
# (The original per-row loop used positional indexing on an id-indexed
# Series plus chained assignment — both deprecated; fillna is equivalent.)
movie_df['gross'] = movie_df['gross'].fillna(movie_df['gross_1'])
# the wiki gross column has served its purpose
movie_df = movie_df.drop('gross_1', axis=1)
# budget : cost from the movie dataset
# cost   : cost from the Wikidata dataset
# Keep the original budget; where it is NaN, take the Wikidata value
# (vectorized replacement for the deprecated per-row positional loop).
movie_df['budget'] = movie_df['budget'].fillna(movie_df['cost'])
# drop the consumed wiki columns
movie_df = movie_df.drop(['cost', 'publicationYear'], axis=1)
Now we can see that the number of movies with empty values in gross and budget has decreased :)
movie_df.isna().sum()
# TMDB export: keep only budget and keywords; rename budget to avoid a clash.
tmdb_data = pd.read_csv('data/tmdb_movies_data.csv').set_index("imdb_id")
tmdb_data = tmdb_data.rename(columns={"budget": "budget_1"})[['budget_1', 'keywords']]
tmdb_data.head()
# Left-join on the shared IMDb-id index.
movie_df = movie_df.merge(tmdb_data, how='left', left_index=True, right_index=True)
movie_df.head()
# budget   : cost from the movie dataset
# budget_1 : cost from the TMDB dataset
# Keep the original budget; where it is NaN, take the TMDB value
# (vectorized replacement for the deprecated per-row positional loop).
movie_df['budget'] = movie_df['budget'].fillna(movie_df['budget_1'])
# the TMDB budget column has served its purpose
movie_df = movie_df.drop('budget_1', axis=1)
# plot_keywords : from the movie dataset
# keywords      : from the TMDB dataset
# Keep the original keywords; where they are NaN, take the TMDB value
# (vectorized replacement for the deprecated per-row positional loop).
movie_df['plot_keywords'] = movie_df['plot_keywords'].fillna(movie_df['keywords'])
# the TMDB keywords column has served its purpose
movie_df = movie_df.drop(['keywords'], axis=1)
movie_df.isna().sum()
# Provisional fills for gross and budget; they will be refined later.
movie_df.gross = movie_df.gross.fillna(movie_df.gross.median())
movie_df.budget = movie_df.budget.fillna(0)
# Placeholder names for missing actor credits.
movie_df = movie_df.fillna(value={'actor_2_name': 'No Second Actor',
                                  'actor_1_name': 'No Actor',
                                  'actor_3_name': 'No Third Actor'})
# Missing Facebook-like counts default to zero.
movie_df = movie_df.fillna(value={'actor_3_facebook_likes': 0,
                                  'actor_2_facebook_likes': 0,
                                  'actor_1_facebook_likes': 0})
# Movies without a rating are treated as 'Not Rated'.
movie_df = movie_df.fillna(value={'content_rating': 'Not Rated'})
# Face counts looked up by hand for a few posters with missing values.
poster_faces = {
    'tt3949660': 4,
    'tt0903624': 1,
    'tt3567288': 0,
    'tt0989757': 2,
    'tt1929263': 0,
    'tt3014666': 6,
}
for movie_id, n_faces in poster_faces.items():
    movie_df.loc[movie_id, 'facenumber_in_poster'] = n_faces
# Drop any remaining incomplete rows and unify the two "no rating" labels.
movie_df = movie_df.dropna()
movie_df = movie_df.replace({'content_rating': {'Unrated': 'Not Rated'}})
movie_df.head()
movie_df.isna().sum()
# One-hot encode the pipe-separated genre lists.
mlb = MultiLabelBinarizer()
genre_lists = movie_df.genres.str.split("|")
encoded_genres = pd.DataFrame(mlb.fit_transform(genre_lists),
                              columns=mlb.classes_, index=movie_df.index)
encoded_genres
# How many movies carry each genre, least common first.
count_encoded_genres = encoded_genres.sum().sort_values(ascending=True)
fig, ax = plt.subplots(figsize=(6, 6))
count_encoded_genres.plot(kind='barh', legend=False, ax=ax)
ax.set_xlabel('Number of movies')
ax.set_ylabel('Genre')
plt.show()
We can group together similar genres, in order to decrease the number of columns.
def _merge_genres(encoded, cols, merged_name):
    """Collapse the one-hot columns `cols` into a single indicator column.

    The new column `merged_name` is 1 when the movie belongs to at least
    one of `cols`; the original columns are dropped.
    """
    encoded[merged_name] = (encoded[cols].sum(axis=1) > 0).astype(int)
    return encoded.drop(cols, axis=1)

# Group semantically similar genres to reduce the number of columns
# (replaces five copy-pasted blocks with one helper).
encoded_genres = _merge_genres(encoded_genres, ['Mystery', 'Thriller', 'Horror'], 'Mystery_Thriller_Horror')
encoded_genres = _merge_genres(encoded_genres, ['Sci-Fi', 'Fantasy'], 'Sci-Fi_Fantasy')
encoded_genres = _merge_genres(encoded_genres, ['Family', 'Animation'], 'Family_Animation')
encoded_genres = _merge_genres(encoded_genres, ['Action', 'Adventure'], 'Action_Adventure')
encoded_genres = _merge_genres(encoded_genres, ['History', 'War'], 'History_War')

# Re-plot the per-genre counts after the grouping.
count_encoded_genres = encoded_genres.sum().sort_values(ascending=True)
fig, ax = plt.subplots(figsize=(6, 6))
count_encoded_genres.plot(kind='barh', legend=False, ax=ax)
ax.set_xlabel('Number of movies')
ax.set_ylabel('Genre')
plt.show()
At this point we can put together the genres which are not common:
# Bucket the rare genres into a single 'Others' indicator column.
other = ['Music', 'Sport', 'Documentary', 'Musical', 'Western', 'Short', 'News']
encoded_genres['Others'] = np.array([encoded_genres[other].sum(axis=1).values > 0]).astype(int).T
encoded_genres = encoded_genres.drop(other, axis=1)
# Attach the genre indicators to the main frame.
movie_df = movie_df.merge(encoded_genres, how='left', left_index=True, right_index=True)
# Final per-genre counts after all groupings.
count_encoded_genres = encoded_genres.sum().sort_values(ascending=True)
fig, ax = plt.subplots(figsize=(6, 6))
count_encoded_genres.plot(kind='barh', legend=False, ax=ax)
ax.set_xlabel('Number of movies')
ax.set_ylabel('Genre')
plt.show()
For each director and actor we assign a rank that considers different features characterising him/her. It will be useful to identify the most important people who contribute to the movie.
# Number of movies per director (Counter preserves first-appearance order and
# replaces the O(n^2) list.count loop of the original).
group_director = movie_df.groupby('director_name')
director_counts = Counter(movie_df.director_name.values)
directors = pd.DataFrame(director_counts.items(),
                         columns=['director', 'num_movies']).set_index('director')
directors.head()
In the regression, we cannot use variables which are consequences, such as movie_facebook_likes. However, we can use them to understand how famous a director is. If a film has a lot of likes, its director will probably earn a good reputation.
Same reasoning for imdb_score
# Mean IMDb score over each director's movies. groupby(...)['col'].mean()
# computes only the needed column and, unlike groupby(...).mean(), does not
# raise on non-numeric columns in pandas >= 2.0.
score = group_director['imdb_score'].mean()
directors = directors.merge(score, how='left', left_index=True, right_index=True)
directors.head(10)
print("Director:")
# Raw and log-scaled distributions of each director feature.
# sns.distplot was deprecated and removed in seaborn 0.14;
# histplot(kde=True, stat='density') is the documented replacement.
for col in ["num_movies", "imdb_score"]:
    _, (ax1, ax2) = plt.subplots(figsize=(12, 4), nrows=1, ncols=2)
    sns.histplot(directors[col].fillna(0).astype(float), color='g', kde=True, stat='density', ax=ax1)
    sns.histplot(np.log(directors[col].fillna(0).astype(float) + 1), color="r", kde=True, stat='density', ax=ax2)
    for ax in (ax1, ax2):
        ax.tick_params(size=12)
        ax.set_ylabel("density", size=15)
    ax1.set_title(f"Distribution of {col}", size=15)
    ax2.set_title(f"Distribution of log-{col}", size=15)
Using these features, we can create a ranking for the directors. We consider 10 levels: if a director is at level 10, he or she is really important, having made many movies with high ratings.
In order to do that, we normalize each feature to [0, 1] and average them with equal weights.
# Scale both features to [0, 1] so they can be averaged on equal footing.
directors.num_movies = directors.num_movies / directors.num_movies.max()
directors.imdb_score = directors.imdb_score / directors.imdb_score.max()
# Equal-weight average of the two normalized features.
directors['mean_'] = directors.num_movies * 0.5 + directors.imdb_score * 0.5
massimo = directors.mean_.max()
# Map the score onto integer levels 0..10 (truncation, not rounding).
directors['director_rank'] = [int(x) for x in directors.mean_ / massimo * 10]
directors = directors.sort_values(['director_rank', 'num_movies'], ascending=False)
directors.head(10)
print("Less influent directors:")
directors.sort_values(by=['director_rank']).head()
# Re-rank directors considering only movies released after 2000.
movie_recent_df = movie_df[movie_df.title_year > 2000]
group_director = movie_recent_df.groupby('director_name')
# Counter replaces the O(n^2) list.count loop; insertion order preserved.
director_counts = Counter(movie_recent_df.director_name.values)
directors = pd.DataFrame(director_counts.items(),
                         columns=['director', 'num_movies']).set_index('director')
# Per-director means; column selection avoids pandas>=2.0 numeric_only errors.
likes = group_director['movie_facebook_likes'].mean()
score = group_director['imdb_score'].mean()
directors = directors.merge(score, how='left', left_index=True, right_index=True)
directors = directors.merge(likes, how='left', left_index=True, right_index=True)
# Scale each feature to [0, 1] before averaging.
directors.num_movies = directors.num_movies / directors.num_movies.max()
directors.movie_facebook_likes = directors.movie_facebook_likes / directors.movie_facebook_likes.max()
directors.imdb_score = directors.imdb_score / directors.imdb_score.max()
# Equal-weight mean of the three normalized features.
directors['mean_'] = directors.num_movies * 1/3 + directors.imdb_score * 1/3 + directors.movie_facebook_likes * 1/3
massimo = directors.mean_.max()
directors['director_rank'] = [int(x) for x in directors.mean_ / massimo * 10]
directors = directors.sort_values(['director_rank', 'num_movies'], ascending=False)
directors.head(15)
# actor_1_name: build a 0..10 rank for lead actors.
actor1 = movie_df.groupby('actor_1_name')
# movies per lead actor (Counter replaces the O(n^2) list.count loop)
actor1_counts = Counter(movie_df.actor_1_name.values)
actors1 = pd.DataFrame(actor1_counts.items(),
                       columns=['actors1', 'num_movies']).set_index('actors1')
# mean actor_1_facebook_likes over that actor's movies
# (column selection avoids pandas>=2.0 numeric_only errors)
num_likes = actor1['actor_1_facebook_likes'].mean()
actors1 = actors1.merge(num_likes, how='left', left_index=True, right_index=True)
actors1.head(5)
print("Actor1:")
# sns.distplot was removed in seaborn 0.14; histplot(kde=True) replaces it.
for col in ["num_movies", "actor_1_facebook_likes"]:
    _, (ax1, ax2) = plt.subplots(figsize=(15, 5), nrows=1, ncols=2)
    sns.histplot(actors1[col].fillna(0).astype(float), color='g', kde=True, stat='density', ax=ax1)
    sns.histplot(np.log(actors1[col].fillna(0).astype(float) + 1), color="r", kde=True, stat='density', ax=ax2)
    for ax in (ax1, ax2):
        ax.tick_params(size=12)
        ax.set_ylabel("density", size=15)
    ax1.set_title(f"Distribution of {col}", size=15)
    ax2.set_title(f"Distribution of log-{col}", size=15)
# Scale each feature to [0, 1]; likes on a log scale given their skew.
actors1.num_movies = actors1.num_movies / actors1.num_movies.max()
max_likes = np.log(actors1.actor_1_facebook_likes.max())
actors1.actor_1_facebook_likes = [1 if x == 0 else x for x in actors1.actor_1_facebook_likes.values]  # avoid log(0)
actors1.actor_1_facebook_likes = np.log(actors1.actor_1_facebook_likes) / max_likes
# Equal-weight mean of the two normalized features, mapped to levels 0..10.
actors1['mean_'] = actors1.num_movies * 1/2 + actors1.actor_1_facebook_likes * 1/2
massimo = actors1.mean_.max()
actors1['actor1_rank'] = [int(x) for x in actors1.mean_ / massimo * 10]
actors1 = actors1.sort_values(['actor1_rank', 'num_movies'], ascending=False)
# Sentinel row for movies without a credited lead actor.
df2 = pd.DataFrame([['No Actor', 0, 0, 0, 0]],
                   columns=['actors1', 'num_movies', 'actor_1_facebook_likes', 'mean_', 'actor1_rank']).set_index('actors1')
# DataFrame.append was removed in pandas 2.0; concat is the replacement.
actors1 = pd.concat([actors1, df2])
actors1.head(10)
# actor_2_name: build a 0..10 rank for second-billed actors (same recipe as actors1).
actor2 = movie_df.groupby('actor_2_name')
# movies per actor (Counter replaces the O(n^2) list.count loop)
actor2_counts = Counter(movie_df.actor_2_name.values)
actors2 = pd.DataFrame(actor2_counts.items(),
                       columns=['actors2', 'num_movies']).set_index('actors2')
# mean actor_2_facebook_likes over that actor's movies
num_likes = actor2['actor_2_facebook_likes'].mean()
actors2 = actors2.merge(num_likes, how='left', left_index=True, right_index=True)
actors2.head(5)
print("Actor2:")
# sns.distplot was removed in seaborn 0.14; histplot(kde=True) replaces it.
for col in ["num_movies", "actor_2_facebook_likes"]:
    _, (ax1, ax2) = plt.subplots(figsize=(15, 5), nrows=1, ncols=2)
    sns.histplot(actors2[col].fillna(0).astype(float), color='g', kde=True, stat='density', ax=ax1)
    sns.histplot(np.log(actors2[col].fillna(0).astype(float) + 1), color="r", kde=True, stat='density', ax=ax2)
    for ax in (ax1, ax2):
        ax.tick_params(size=12)
        ax.set_ylabel("density", size=15)
    ax1.set_title(f"Distribution of {col}", size=15)
    ax2.set_title(f"Distribution of log-{col}", size=15)
# Scale each feature to [0, 1]; likes on a log scale given their skew.
actors2.num_movies = actors2.num_movies / actors2.num_movies.max()
max_likes = np.log(actors2.actor_2_facebook_likes.max())
actors2.actor_2_facebook_likes = [1 if x == 0 else x for x in actors2.actor_2_facebook_likes.values]  # avoid log(0)
actors2.actor_2_facebook_likes = np.log(actors2.actor_2_facebook_likes) / max_likes
# Equal-weight mean of the two normalized features, mapped to levels 0..10.
actors2['mean_'] = actors2.num_movies * 1/2 + actors2.actor_2_facebook_likes * 1/2
massimo = actors2.mean_.max()
actors2['actor2_rank'] = [int(x) for x in actors2.mean_ / massimo * 10]
actors2 = actors2.sort_values(['actor2_rank', 'num_movies'], ascending=False)
# Sentinel row for movies without a credited second actor.
df2 = pd.DataFrame([['No Second Actor', 0, 0, 0, 0]],
                   columns=['actors2', 'num_movies', 'actor_2_facebook_likes', 'mean_', 'actor2_rank']).set_index('actors2')
# DataFrame.append was removed in pandas 2.0; concat is the replacement.
actors2 = pd.concat([actors2, df2])
actors2.head(10)
# actor_3_name: build a 0..10 rank for third-billed actors (same recipe as actors1).
actor3 = movie_df.groupby('actor_3_name')
# movies per actor (Counter replaces the O(n^2) list.count loop)
actor3_counts = Counter(movie_df.actor_3_name.values)
actors3 = pd.DataFrame(actor3_counts.items(),
                       columns=['actors3', 'num_movies']).set_index('actors3')
# mean actor_3_facebook_likes over that actor's movies
num_likes = actor3['actor_3_facebook_likes'].mean()
actors3 = actors3.merge(num_likes, how='left', left_index=True, right_index=True)
actors3.head(5)
print("Actor3:")
# sns.distplot was removed in seaborn 0.14; histplot(kde=True) replaces it.
for col in ["num_movies", "actor_3_facebook_likes"]:
    _, (ax1, ax2) = plt.subplots(figsize=(15, 5), nrows=1, ncols=2)
    sns.histplot(actors3[col].fillna(0).astype(float), color='g', kde=True, stat='density', ax=ax1)
    sns.histplot(np.log(actors3[col].fillna(0).astype(float) + 1), color="r", kde=True, stat='density', ax=ax2)
    for ax in (ax1, ax2):
        ax.tick_params(size=12)
        ax.set_ylabel("density", size=15)
    ax1.set_title(f"Distribution of {col}", size=15)
    ax2.set_title(f"Distribution of log-{col}", size=15)
# Scale each feature to [0, 1]; likes on a log scale given their skew.
actors3.num_movies = actors3.num_movies / actors3.num_movies.max()
max_likes = np.log(actors3.actor_3_facebook_likes.max())
actors3.actor_3_facebook_likes = [1 if x == 0 else x for x in actors3.actor_3_facebook_likes.values]  # avoid log(0)
actors3.actor_3_facebook_likes = np.log(actors3.actor_3_facebook_likes) / max_likes
# Equal-weight mean of the two normalized features, mapped to levels 0..10.
actors3['mean_'] = actors3.num_movies * 1/2 + actors3.actor_3_facebook_likes * 1/2
massimo = actors3.mean_.max()
actors3['actor3_rank'] = [int(x) for x in actors3.mean_ / massimo * 10]
actors3 = actors3.sort_values(['actor3_rank', 'num_movies'], ascending=False)
# Sentinel row for movies without a credited third actor.
df2 = pd.DataFrame([['No Third Actor', 0, 0, 0, 0]],
                   columns=['actors3', 'num_movies', 'actor_3_facebook_likes', 'mean_', 'actor3_rank']).set_index('actors3')
# DataFrame.append was removed in pandas 2.0; concat is the replacement.
actors3 = pd.concat([actors3, df2])
actors3.head()
Add director and actors ratings to movie_df:
# Merge the director/actor rank columns into movie_df. The merges reset the
# index, so imdb_id is stashed as a column first and restored at the end;
# drop_duplicates guards against fan-out from the merges.
movie_df['imdb_id'] = movie_df.index
movie_df = pd.merge(left=movie_df, right=directors.director_rank , left_on='director_name', right_on='director')
movie_df = pd.merge(left=movie_df, right=actors1.actor1_rank , left_on='actor_1_name', right_on='actors1')
movie_df = pd.merge(left=movie_df, right=actors2.actor2_rank , left_on='actor_2_name', right_on='actors2')
movie_df = pd.merge(left=movie_df, right=actors3.actor3_rank , left_on='actor_3_name', right_on='actors3')
movie_df = movie_df.sort_values(by='imdb_id').drop_duplicates(subset='imdb_id', keep='last').set_index('imdb_id')
movie_df
In this section we searched for the most famous actor and the most prominent director after 2000.
movie_recent_df = movie_df[movie_df.title_year>2000] # select movies released after 2000
Director
# Rank post-2000 directors: equal-weight blend of normalized movie count,
# normalized mean movie facebook likes, and normalized mean IMDb score.
group_director = movie_recent_df.groupby('director_name')
directors = list(movie_recent_df.director_name.values)
directors_unique =list(OrderedDict.fromkeys(directors))
results = [directors.count(x) for x in directors_unique] #number of films
directors = pd.DataFrame([directors_unique, results], index=['director', 'num_movies']).T
directors = directors.set_index('director')
# mean movie_facebook_likes and imdb_score over each director's movies
likes = group_director.mean().movie_facebook_likes
score = group_director.mean().imdb_score
directors = directors.merge(score, how='left', left_index=True, right_index=True)
directors = directors.merge(likes, how='left', left_index=True, right_index=True)
# normalize each feature to [0, 1] by its maximum
directors.num_movies = directors.num_movies/directors.num_movies.max()
directors.movie_facebook_likes = directors.movie_facebook_likes/directors.movie_facebook_likes.max()
directors.imdb_score = directors.imdb_score/directors.imdb_score.max()
#we consider that each column has the same weight and we do the mean
directors['mean_'] = directors.num_movies*1/3 + directors.imdb_score*1/3 + directors.movie_facebook_likes*1/3
massimo = directors.mean_.max()
directors['director_rank'] = [int(x) for x in directors.mean_/massimo*10]
directors = directors.sort_values(['director_rank','num_movies'], ascending=False)
# index[0] is the top-ranked director after sorting
string_dir = color.BOLD + directors.index[0] + color.END
print(f"The most prominent director after 2000 is {string_dir}")
# Rank post-2000 lead actors (actor_1): equal-weight blend of normalized
# movie count and normalized log facebook likes, scaled to an integer 0-10.
actor1 = movie_recent_df.groupby('actor_1_name')  # group by lead actor
actors1 = list(movie_recent_df.actor_1_name.values)  # lead-actor list (with repeats)
actors1_unique = list(OrderedDict.fromkeys(actors1))  # de-duplicated, order preserved
# number of movies for each lead actor
results = [actors1.count(x) for x in actors1_unique]
actors1 = pd.DataFrame([actors1_unique, results], index=['actors1', 'num_movies']).T
actors1 = actors1.set_index('actors1')
# mean actor_1_facebook_likes over each actor's movies
num_likes = actor1.mean().actor_1_facebook_likes
actors1 = actors1.merge(num_likes, how='left', left_index=True, right_index=True)
# normalize movie count to [0, 1]
actors1.num_movies = actors1.num_movies/actors1.num_movies.max()
# log-scale the facebook likes; zeros are bumped to 1 so the log is defined
max_likes = np.log(actors1.actor_1_facebook_likes.max())
actors1.actor_1_facebook_likes = [1 if x == 0 else x for x in actors1.actor_1_facebook_likes.values]
actors1.actor_1_facebook_likes = np.log(actors1.actor_1_facebook_likes)/max_likes
# equal-weight mean of the two normalized features
actors1['mean_'] = actors1.num_movies*1/2 + actors1.actor_1_facebook_likes*1/2
massimo = actors1.mean_.max()
actors1['actor1_rank'] = [int(x) for x in actors1.mean_/massimo*10]
actors1 = actors1.sort_values(['actor1_rank','num_movies'], ascending=False)
# sentinel row for movies with no credited lead actor (appended last, so
# index[0] below is still the top-ranked real actor)
df2 = pd.DataFrame([['No Actor',0,0,0,0]],columns=['actors1','num_movies','actor_1_facebook_likes','mean_','actor1_rank']).set_index('actors1')
actors1 = actors1.append(df2)
string_act = color.BOLD + actors1.index[0] + color.END
# BUG FIX: the message said "director" but this section ranks actors.
print(f"The most prominent actor after 2000 is {string_act}")
In this section we fill missing values for the budget.
# Median budget per release year, plotted while missing budgets are still
# encoded as zero. NOTE(review): the code uses median() but labels say "mean".
grouped_by_year = movie_df.groupby("title_year").budget.median()
sns.lineplot(grouped_by_year.index, grouped_by_year)
plt.ylabel("mean budget ")
plt.title("Mean budget for each year with missing values")
plt.show()
Even if the mean budget of each year is non-zero, it can happen that the budget of some movies is zero (this means that the value was missing and we replaced it with zero).
# Count movies whose budget is recorded as zero (zero encodes "missing" here).
zero_budget = (movie_df.budget == 0).values
num_zero_budget = zero_budget.sum()
print(f"Number of missed budget values = {num_zero_budget}")
For each movie we have a rank of its director and its cast. These elements are the ones which most influence the movie's budget. Another feature which influences the cost of a movie is the genre. We will rank every movie considering cast/director and its genre.
cast/director
# Rank for the cast from the ranks of its 3 actors.
fill_budget_df = movie_df[['actor1_rank','actor2_rank','actor3_rank']]
# The protagonist carries the biggest weight: 50% first actor, 30% second, 20% third.
# BUG FIX: the 20% term previously reused actor2_rank instead of actor3_rank.
fill_budget_df['rank_cast'] = fill_budget_df.actor1_rank * 0.5 + fill_budget_df.actor2_rank * 0.30 + fill_budget_df.actor3_rank * 0.2
fill_budget_df.head()
# Keep only the combined cast rank, then add the director's rank alongside it.
fill_budget_df = fill_budget_df.drop(columns= ['actor1_rank','actor2_rank','actor3_rank'])
fill_budget_df['director_rank'] = movie_df.director_rank
fill_budget_df.head()
genre
Some genres require a bigger investment. We found some articles which prove that (see Report). In particular, the most expensive are
Average 65 million
Then there is
Average 60 million
Others have a mean of 20 million.
Then we can assign to each movie a rank based on genre average cost:
# One-hot genre matrix for every movie (values are 0/1 indicator columns).
movie_genres = movie_df[[
'Biography','Comedy','Crime','Drama','Romance','Mystery_Thriller_Horror',
'Sci-Fi_Fantasy','Family_Animation','Action_Adventure','History_War','Others']]
movie_genres.to_csv('data/data_genre.csv')
movie_genres.head()
# Assign each movie a genre cost rank, normalized by the most expensive tier
# (65M average): Action/Sci-Fi/History-War -> 1, Family/Animation -> 60/65,
# everything else -> 20/65 (per the article figures quoted above).
genre_rank = []
for i in range(len(movie_genres)):  # for each movie
    index = movie_genres.index[i]
    rowData = movie_genres.loc[ index , : ]  # all genres of one movie
    if rowData.Action_Adventure==1 or rowData['Sci-Fi_Fantasy']==1 or rowData.History_War==1 :
        genre_rank.append(1)
    elif rowData.Family_Animation==1 :
        genre_rank.append(round(float(60/65),2))
    else :
        # BUG FIX: this branch duplicated 60/65; the "others" tier averages
        # 20M, so its normalized rank is 20/65.
        genre_rank.append(round(float(20/65),2))
fill_budget_df['genre_rank'] = genre_rank
# Scale rank_cast and director_rank to [0, 1] so all three ranks are comparable.
fill_budget_df = fill_budget_df.sort_values('director_rank', ascending=False) #sort values
# scale 0-1 also for rank_cast and director_rank
fill_budget_df.rank_cast = fill_budget_df.rank_cast/fill_budget_df.rank_cast.max()
fill_budget_df.director_rank = [round(float(x),2) for x in fill_budget_df.director_rank]
fill_budget_df.director_rank = fill_budget_df.director_rank/fill_budget_df.director_rank.max()
Note: this is a ranking of relative importance; it does not mean that the actual cost shares are 0.3/0.3/0.3.
We suppose each is equally important:
# Equal-weight mean of the three [0,1] ranks (cast, director, genre), then
# attach the raw budget so missing (zero) budgets can be filled from the rank.
fill_budget_df['rank_for_budget'] = fill_budget_df.mean(axis=1)
fill_budget_df.head()
fill_budget_df = fill_budget_df.drop(columns= ['rank_cast','director_rank','genre_rank'])
fill_budget_df['budget'] = movie_df.budget
fill_budget_df = fill_budget_df.sort_values('rank_for_budget', ascending=False) #sort values
fill_budget_df.head()
Now each film has a rank of features which influence the budget. Keeping all of them would be too complicated, so we reduce it to a rank of 20 values.
# Discretize rank_for_budget into integer buckets 0-20 and compute the mean
# budget of each bucket, using only movies whose budget is known (non-zero).
massimo = fill_budget_df.rank_for_budget.max()
fill_budget_df['rank_for_budget_integer'] = [int(x) for x in fill_budget_df.rank_for_budget/massimo*2*10]
fill_budget_df.head()
# mean of each rank of budget, ignoring movies with missing budget (which would bias the mean)
# ignore zero budget movies
movies_with_budget_df = fill_budget_df.drop(fill_budget_df[fill_budget_df.budget==0].index)
movies_with_budget_df = movies_with_budget_df.drop(columns='rank_for_budget')
data = movies_with_budget_df.groupby(movies_with_budget_df.rank_for_budget_integer)
data = data.mean()
data.head()
Now for each rank (no movies under rank = 7) we have the mean of its movies. We can assign this value of budget to movies with missing budget
# For every movie with budget == 0, look up the mean budget of its integer
# rank bucket and write it back into movie_df.
movies_missing_budget_df = fill_budget_df.drop(fill_budget_df[fill_budget_df.budget>0].index)
movies_missing_budget_df = movies_missing_budget_df.drop(columns='rank_for_budget')
movies_missing_budget_df.head()
print(f"We check that they are actually {len(movies_missing_budget_df)} movies")
# fill values
for i in range(len(movies_missing_budget_df)):
index = movies_missing_budget_df.index[i]
rowData = movies_missing_budget_df.loc[ index , : ] # all genres of one movie
rank_movie = int(rowData.rank_for_budget_integer)
budget_movie = data.loc[ rank_movie , : ].budget
# replace the 0.0 budget of this single movie with its bucket's mean budget
movie_df.update(
movie_df.loc[movie_df.index == index, 'budget'].replace(0.0, budget_movie, regex=True)
)
# we should have filled all zero budget values. check:
zero_budget = (movie_df.budget==0).values
num_zero_budget = zero_budget.sum()
print(f"Number of missed budget values = {num_zero_budget}")
We create the gross budget ratio feature, an important feature to understand the profitability of the movies. Note that the creation of this variable does not cause any problems because we have substituted every zero budget value in the Filling Missed Budget Values part.
# Profitability proxy: gross earnings divided by (now fully non-zero) budget.
movie_df["gross_budget_ratio"] = movie_df.gross / movie_df.budget
# Persist two snapshots of the cleaned dataset for the regression notebooks.
movie_df.to_csv('data/data_regression_median.csv')
movie_df.to_csv('data/data_regression.csv')
Firstly, we plot a general description of our data:
movie_df.describe()  # summary statistics of all numeric columns
Distribution of the main numerical features of the data set. Due to the right skew of the distributions of budget and gross earnings, it is appropriate to consider the log transformation of these variables.
# For each key numeric feature, plot the raw distribution (left, green) next
# to its log1p-transformed distribution (right, red).
# NOTE(review): sns.distplot is deprecated in seaborn >= 0.11 (use histplot/displot).
for col in ["duration", "gross", "actor_1_facebook_likes","imdb_score","budget","gross_budget_ratio"]:
_, (ax1, ax2) = plt.subplots(figsize=(12,4), nrows=1, ncols=2)
sns.distplot(movie_df[col].fillna(0),color='g', ax=ax1)
sns.distplot(np.log(movie_df[col].fillna(0) + 1), color="r", ax=ax2)
for ax in (ax1, ax2):
ax.tick_params(size=12)
ax.set_ylabel("density", size=15)
ax1.set_title(f"Distribution of {col}", size=15)
ax2.set_title(f"Distribution of log-{col}", size=15)
We can see that the distributions of budget and gross of movies are not so explanatory. This is due to the fact that the data set includes both box-office movies and documentaries, which have a significant difference in marketing and financial needs. So it might be interesting to look at the number of movies by genre.
# Side-by-side describe() of gross, budget, profit (gross - budget) and the
# gross/budget ratio, collected into one summary table.
grossde=movie_df['gross'].describe()
budgetde=movie_df['budget'].describe()
ratiogb=movie_df['gross_budget_ratio'].describe()
profitde=(movie_df['gross']-movie_df['budget']).describe()
summary = pd.DataFrame([grossde,budgetde,profitde,ratiogb]).T
summary.columns.values[2] = "profits"
summary#summary statistics of relevant variables
Now, we plot the SP 500 (see Report for details) for a comparison with the global economy
# Load the S&P 500 CSV (reversed so rows run oldest-to-newest), attach a
# monthly PeriodIndex, parse comma-formatted prices, then plot annual means.
sp500 = (pd.read_csv(r'data/SeP500.csv')).iloc[::-1]
index = pd.period_range(start="01-01-1980", end="04-01-2020", freq="M")
sp500.index=index
sp500['Price'] = sp500['Price'].str.replace(',', '') # strip thousands separators
sp500['Price'] = sp500['Price'].astype(float)
sp500price=sp500.Price.resample("A").mean() # annual mean price
sp500price.plot(x_compat=True)
plt.title("SP500") # fra check the title
plt.ylabel("SP500")
plt.show()
FED
Interest rate as index of required return
# Load the FED funds rate series, attach a monthly PeriodIndex, and plot the
# annual mean rate (used as a proxy for required return).
fed = (pd.read_csv(r'data/fed.csv'))#.iloc[::-1]
index = pd.period_range(start="01-01-1980", end="03-01-2020", freq="M")
fed.index=index
fed['FEDFUNDS'] = fed['FEDFUNDS'].astype(float)
fedrate=fed.FEDFUNDS.resample("A").mean() # annual mean rate
fedrate.plot(x_compat=True)
plt.ylabel("FED interest rate")
plt.title("FED interest rate for each year")
plt.show()
# Mean gross/budget ratio per release year, plotted over time.
movie_df["gross_budget_ratio"] = movie_df.gross / movie_df.budget
grouped_by_year = movie_df.groupby("title_year").gross_budget_ratio.mean()
sns.lineplot(grouped_by_year.index, grouped_by_year)
plt.ylabel("mean gross / budget ratio")
plt.xticks(np.arange(min(grouped_by_year.index), max(grouped_by_year.index)+1, 4))
plt.title("mean gross / budget ratio overtime")
plt.show()
With this analysis we want to find out the relationship between a factor with the two indicators of a film's success: profitability and IMDB score.
# Correlation heatmap between IMDb score and gross/budget ratio, after
# dropping extreme-ratio outliers (> 20).
movies_with_valid_profit = movie_df[movie_df.gross_budget_ratio<=20]
# again, don't see much influence here (between what?)
pivot=movie_df[['imdb_score',"gross_budget_ratio"]]
pivot=pivot[pivot.gross_budget_ratio<=20]
corp = pivot.apply(lambda x: pd.factorize(x)[0])#scaling
corp=corp.corr()
plt.figure(figsize=(8, 4))
ax = sns.heatmap(corp, xticklabels=corp.columns, yticklabels=corp.columns,
linewidths=.2, cmap="YlGnBu")
plt.yticks(rotation=0)
plt.show()
## helper that writes the correlation coefficient explicitly in the pairplot
def cor(x, y, **kwargs):
    """Annotate the current axes with the Pearson correlation of x and y."""
    rho = np.corrcoef(x, y)[0][1]
    # build the rho label shown in the plot corner
    text = r'$\rho$ = ' + str(round(rho, 2))
    axis = plt.gca()
    axis.annotate(text, xy = (0.2, 0.95), size = 15, xycoords = axis.transAxes)
# Pairplot of score vs. ratio (outliers removed); upper panels get rho labels.
grid=sns.pairplot(data=movie_df[movie_df.gross_budget_ratio<=20], vars=["imdb_score", "gross_budget_ratio"])
grid = grid.map_upper(cor)
If we remove the outliers of profitability ratio, we see that to have a profitability ratio higher than 5, a movie must have an IMDb score of 4 and above.
# Mean gross/budget ratio within six IMDb-score buckets, shown as a barplot.
movies_with_imdb_classmean=[]
movies_with_imdb_classmean.append(movie_df[(1<movie_df.imdb_score) & (movie_df.imdb_score<=2.5) ].gross_budget_ratio.mean())
movies_with_imdb_classmean.append(movie_df[(2.5<movie_df.imdb_score) & (movie_df.imdb_score<=4) ].gross_budget_ratio.mean())
movies_with_imdb_classmean.append(movie_df[(4<movie_df.imdb_score) & (movie_df.imdb_score<=5.5) ].gross_budget_ratio.mean())
movies_with_imdb_classmean.append(movie_df[(5.5<movie_df.imdb_score) & (movie_df.imdb_score<=7) ].gross_budget_ratio.mean())
movies_with_imdb_classmean.append(movie_df[(7<movie_df.imdb_score) & (movie_df.imdb_score<=8.5) ].gross_budget_ratio.mean())
movies_with_imdb_classmean.append(movie_df[(8.5<movie_df.imdb_score) & (movie_df.imdb_score<=10) ].gross_budget_ratio.mean())
Limits =['1-2.5','2.5-4','4-5.5','5.5-7','7-8.5','8.5-10']
sns.barplot(Limits, movies_with_imdb_classmean)
plt.xticks(rotation=0)
plt.ylabel("Gross/Budget Ratio")
plt.xlabel('IMDb Scores')
plt.title(f"Mean Gross/Budget Ratios")
plt.show()
# Mean IMDb score within six gross/budget-ratio buckets, shown as a barplot.
movies_with_gb_classmean=[]
movies_with_gb_classmean.append(movie_df[(0<movie_df.gross_budget_ratio) & (movie_df.gross_budget_ratio<=1) ].imdb_score.mean())
movies_with_gb_classmean.append(movie_df[(1<movie_df.gross_budget_ratio) & (movie_df.gross_budget_ratio<=2.5) ].imdb_score.mean())
movies_with_gb_classmean.append(movie_df[(2.5<movie_df.gross_budget_ratio) & (movie_df.gross_budget_ratio<=5) ].imdb_score.mean())
movies_with_gb_classmean.append(movie_df[(5<movie_df.gross_budget_ratio) & (movie_df.gross_budget_ratio<=10) ].imdb_score.mean())
movies_with_gb_classmean.append(movie_df[(10<movie_df.gross_budget_ratio) & (movie_df.gross_budget_ratio<=20) ].imdb_score.mean())
movies_with_gb_classmean.append(movie_df[(20<movie_df.gross_budget_ratio)].imdb_score.mean())
# BUG FIX: the first bucket covers ratios in (0, 1], so its label is '0-1' (was '1-1').
Limits =['0-1','1-2.5','2.5-5','5-10','10-20','20+']
sns.barplot(Limits, movies_with_gb_classmean)
plt.xticks(rotation=0)
plt.ylabel("IMDb Scores")
plt.xlabel('Gross/Budget Ratios')
plt.title("Mean IMDb Scores")
plt.show()
# Correlation heatmap between IMDb score, the director/actor ranks and the
# gross/budget ratio.
datara=movie_df[['imdb_score','director_rank', 'actor1_rank', 'actor2_rank', 'actor3_rank',
'gross_budget_ratio']]
#datara = datara.apply(lambda x: pd.factorize(x)[0])
corre=datara.corr()
plt.figure(figsize=(12, 6))
ax = sns.heatmap(corre, xticklabels=corre.columns, yticklabels=corre.columns,
linewidths=.2, cmap="YlGnBu")
We can see that on average the rank of the director and cast does not have an impact on profitability. However, it is still worth checking whether this trend also holds for top directors and actors, since they are a significant part of the budget of a movie.
# Correlation heatmap between facebook-like features, IMDb score and the
# gross/budget ratio, restricted to movies released after 2009.
datapi=movie_df[["cast_total_facebook_likes", "movie_facebook_likes", "director_facebook_likes",'imdb_score',"gross_budget_ratio","title_year"]]
datapi=datapi.loc[datapi.title_year>2009]
datapi=datapi.drop("title_year", axis=1)
datapi = datapi.apply(lambda x: pd.factorize(x)[0])
core=datapi.corr()
plt.figure(figsize=(12, 6))
ax = sns.heatmap(core, xticklabels=core.columns, yticklabels=core.columns,
linewidths=.2, cmap="YlGnBu")
Here, we realize that IMDb score and movie facebook likes might be related. Hence, we analyze these two variables.
# Mean cast_total_facebook_likes within six IMDb-score buckets.
# NOTE(review): the boolean masks are built on movie_df but index datapi
# (a post-2009 subset, factorized) — relies on pandas index alignment; verify
# this slices the intended rows.
movies_with_imdb_classmean2=[]
movies_with_imdb_classmean2.append(datapi[(1<movie_df.imdb_score) & (movie_df.imdb_score<=2.5) ].cast_total_facebook_likes.mean())
movies_with_imdb_classmean2.append(datapi[(2.5<movie_df.imdb_score) & (movie_df.imdb_score<=4) ].cast_total_facebook_likes.mean())
movies_with_imdb_classmean2.append(datapi[(4<movie_df.imdb_score) & (movie_df.imdb_score<=5.5) ].cast_total_facebook_likes.mean())
movies_with_imdb_classmean2.append(datapi[(5.5<movie_df.imdb_score) & (movie_df.imdb_score<=7) ].cast_total_facebook_likes.mean())
movies_with_imdb_classmean2.append(datapi[(7<movie_df.imdb_score) & (movie_df.imdb_score<=8.5) ].cast_total_facebook_likes.mean())
movies_with_imdb_classmean2.append(datapi[(8.5<movie_df.imdb_score) & (movie_df.imdb_score<=10) ].cast_total_facebook_likes.mean())
Limits =['1-2.5','2.5-4','4-5.5','5.5-7','7-8.5','8.5-10']
sns.barplot(Limits, movies_with_imdb_classmean2)
plt.xticks(rotation=0)
plt.ylabel("cast_total_facebook_likes")
plt.xlabel('IMDb Scores')
plt.title(f"Mean total cast Facebook Likes")
plt.yticks(rotation=90)
plt.show()
# Mean cast_total_facebook_likes within six gross/budget-ratio buckets.
movies_with_gb_classmean2=[]
movies_with_gb_classmean2.append(datapi[(0<movie_df.gross_budget_ratio) & (movie_df.gross_budget_ratio<=1) ].cast_total_facebook_likes.mean())
movies_with_gb_classmean2.append(datapi[(1<movie_df.gross_budget_ratio) & (movie_df.gross_budget_ratio<=2.5) ].cast_total_facebook_likes.mean())
movies_with_gb_classmean2.append(datapi[(2.5<movie_df.gross_budget_ratio) & (movie_df.gross_budget_ratio<=5) ].cast_total_facebook_likes.mean())
movies_with_gb_classmean2.append(datapi[(5<movie_df.gross_budget_ratio) & (movie_df.gross_budget_ratio<=10) ].cast_total_facebook_likes.mean())
movies_with_gb_classmean2.append(datapi[(10<movie_df.gross_budget_ratio) & (movie_df.gross_budget_ratio<=20) ].cast_total_facebook_likes.mean())
movies_with_gb_classmean2.append(datapi[(20<movie_df.gross_budget_ratio)].cast_total_facebook_likes.mean())
# BUG FIX: the first bucket covers ratios in (0, 1], so its label is '0-1' (was '1-1').
Limits =['0-1','1-2.5','2.5-5','5-10','10-20','20+']
sns.barplot(Limits, movies_with_gb_classmean2)
plt.xticks(rotation=0)
plt.ylabel("cast total Facebook Likes")
plt.xlabel('gross_budget_ratio')
plt.title("Mean total cast Facebook Likes")
plt.show()
## Using the one-hot genre columns: mean gross/budget ratio and movie count
## per genre combination (outlier ratios > 20 excluded).
GenreProf = movie_df[movie_df.gross_budget_ratio<=20].groupby(['Action_Adventure',
'Biography','Comedy','Crime','Drama','Family_Animation','History_War','Mystery_Thriller_Horror','Others','Romance','Sci-Fi_Fantasy'])['gross_budget_ratio'].mean()
GenreNumber = movie_df[movie_df.gross_budget_ratio<=20].groupby(['Action_Adventure',
'Biography','Comedy','Crime','Drama','Family_Animation','History_War', 'Mystery_Thriller_Horror','Others','Romance','Sci-Fi_Fantasy'])['gross_budget_ratio'].count()
Genrestable = GenreProf.to_frame()
Genrestable1 = Genrestable.reset_index()
# sort combinations by mean profitability; reindex counts to match that order
Genrestable2 = Genrestable1.sort_values(by=['gross_budget_ratio'],ascending=False)
GenreNumber1 = GenreNumber.reset_index()
GenreNumber2 = GenreNumber1.reindex(Genrestable2.index)
Genrestable2
GenreNumber2
From the 2 dataframe above, we see that 3 movies with genres Drama,Crime and Romance together achieved 5.3 gross/budget ratio. However, since number of movies are low it might be due to some outlier. Hence, we try to analyze highest number of movies with the same genre combination.
# Re-sort by movie count so the most common genre combinations come first,
# then plot counts (bars) with mean gross/budget ratio on a twin axis (line).
GenreNumber3=GenreNumber2.sort_values(by=['gross_budget_ratio'],ascending=False)
Genrestable3 = Genrestable2.reindex(GenreNumber3.index)
GenreNumber3
Genrestable3
plt.figure()
# BUG FIX: 'Drame' -> 'Drama' (typo in the first bar label)
genres =['Drama','Mystery-\nThriller-\nHorror','Comedy+\nDrama','Comedy+\nDrama+\nRomance','Comedy']
sns.barplot(genres, GenreNumber3.gross_budget_ratio[:5])
#Genrestable3.gross_budget_ratio[:5].plot(secondary_y=True)
plt.xticks(rotation=0)
plt.ylabel("Number of Movies")
plt.xlabel('Genre Combination')
plt.title("Movies")
plt.yticks(rotation=90)
axes2 = plt.twinx()
axes2.plot(genres, Genrestable3.gross_budget_ratio[:5], color='k')
axes2.set_ylim(0, 5)
axes2.set_ylabel('Mean Gross-Budget Ratio')
plt.show()
Here, we see that at the top 5 places all combinations have more than 140 movies. Hence, we can comment more confidently. We see that most of the segments include Drama in it and all of them has gross/budget ratio higher than 1.8 which is a nice number for film makers. By looking at the data, we can say that:
Hence, it might be logical to make a drama movie with some comedy and romance in it.
In order to test the validity of the results in the EDA also for top-ranked directors, it is relevant to check whether the same trends can be found for a prominent director.
# Case study for one prominent director: print genre/year counts, then plot
# budget & gross (bars) with gross/budget ratio (line, twin axis) per film.
director = "Christopher Nolan" # change this variable if you want to look into another director
director_films = movie_df.query(f"director_name == '{director}'").sort_values(by="title_year")
genres_col={'Biography', 'Comedy', 'Crime', 'Drama', 'Romance',
'Mystery_Thriller_Horror', 'Sci-Fi_Fantasy', 'Family_Animation',
'Action_Adventure', 'History_War', 'Others'}
## genres of films directed and years in which a movie has been released
sums=director_films.sum()
print("Genres' counter\n")
for i in genres_col:
print('Group ',i,'=',sums[i])
print("\n")
print("Number of films per year \n")
print(director_films.title_year.value_counts())
print("\n")
# "YEAR - Title" labels for the x axis
director_titles_years = director_films["title_year"].map(int).map(str) + " - " + director_films["movie_title"].map(str)
graph=director_films["budget"].to_frame()
graph['gross_budget_ratio']=director_films["gross_budget_ratio"]
graph['gross']=director_films["gross"]
graph.index=director_titles_years
ax = plt.subplot(111)
ax2= ax.twinx()
graph.gross_budget_ratio.plot(ax=ax2, style='g-', secondary_y=True, legend='ratio')
ax.set_ylabel('gross and budget', color='b')
graph.plot(ax=ax,y=["budget", "gross"], kind="bar")
plt.ylabel("gross-budget ratio")
ax.tick_params(axis='x', rotation=90)
# move the legend from the bar axis onto the twin axis so it is not hidden
leg = ax.get_legend()
leg.remove() # remove it from ax
ax2.add_artist(leg) # add it to ax2
leg._set_loc(2) # NOTE(review): private matplotlib API — may break on upgrade
plt.title("Gross budget levels overtime")
plt.show()
# Compare the director's per-film gross/budget ratio against the dataset-wide
# yearly mean for the same release years.
grouped_by_yearn = movie_df.groupby("title_year").gross_budget_ratio.mean()
grouped_by_yearn = grouped_by_yearn[grouped_by_yearn.index.isin(director_films.title_year)]
grouped_by_yearn=grouped_by_yearn.to_frame()
grouped_by_yearn.index=graph.index
grouped_by_yearn.rename(columns={"gross_budget_ratio": "gross_budget_ratio_mean"}, inplace=True)
grouped_by_yearn['gross_budget_ratio_nolan']=graph.gross_budget_ratio
ax = plt.subplot(111)
grouped_by_yearn.plot(ax=ax,y=["gross_budget_ratio_mean",'gross_budget_ratio_nolan'], kind="bar",legend='ratio')
ax.tick_params(axis='x', rotation=90)
plt.title("Gross budget ratio compared to average dataset")
plt.show()
# Same comparison for cast_total_facebook_likes: director's films vs. the
# dataset-wide yearly mean.
grouped_by_yearn = movie_df.groupby("title_year").cast_total_facebook_likes.mean()
grouped_by_yearn = grouped_by_yearn[grouped_by_yearn.index.isin(director_films.title_year)]
grouped_by_yearn=grouped_by_yearn.to_frame()
grouped_by_yearn.index=graph.index
subdata=director_films
subdata.index=graph.index
grouped_by_yearn.rename(columns={"cast_total_facebook_likes": "cast_total_facebook_likes_mean"}, inplace=True)
grouped_by_yearn['cast_total_facebook_likes_nolan']=subdata.cast_total_facebook_likes
ax = plt.subplot(111)
grouped_by_yearn.plot(ax=ax,y=["cast_total_facebook_likes_mean",'cast_total_facebook_likes_nolan'], kind="bar",legend='ratio')
ax.tick_params(axis='x', rotation=90)
plt.title(f"cast total facebook likes for films of {director}")
plt.show()
# Same comparison for IMDb score: director's films vs. dataset yearly mean.
grouped_by_yearn = movie_df.groupby("title_year").imdb_score.mean()
grouped_by_yearn = grouped_by_yearn[grouped_by_yearn.index.isin(director_films.title_year)]
grouped_by_yearn=grouped_by_yearn.to_frame()
grouped_by_yearn.index=graph.index
subdata=director_films
subdata.index=graph.index
grouped_by_yearn.rename(columns={"imdb_score": "imdb_score_mean"}, inplace=True)
grouped_by_yearn['imdb_score_nolan']=subdata.imdb_score
ax = plt.subplot(111)
grouped_by_yearn.plot(ax=ax,y=["imdb_score_mean",'imdb_score_nolan'], kind="bar",legend='ratio')
ax.tick_params(axis='x', rotation=90)
plt.title(f"IMDB score for films of {director}")
plt.show()
In order to test the validity of the results in the EDA also for top-ranked actors, it is relevant to check whether the same trends can be found for a prominent actor.
# Case study for one prominent lead actor (same layout as the director study).
actor = "Johnny Depp" # change this variable if you want to look into another actor
actor_films = movie_df.query(f"actor_1_name == '{actor}'").sort_values(by="title_year")
## genres of films in which the actor has appeared as main actor and years in which a movie has been released
sums=actor_films.sum()
print("Genres' counter\n")
for i in genres_col:
print('Group ',i,'=',sums[i])
print("\n")
print("Number of films per year \n")
print(actor_films.title_year.value_counts())
print("\n")
# "YEAR - Title" labels for the x axis
actor_titles_years = actor_films["title_year"].map(int).map(str) + " - " + actor_films["movie_title"].map(str)
graph=actor_films["budget"].to_frame()
graph['gross_budget_ratio']=actor_films["gross_budget_ratio"]
graph['gross']=actor_films["gross"]
graph.index=actor_titles_years
ax = plt.subplot(111)
ax2= ax.twinx()
graph.gross_budget_ratio.plot(ax=ax2, style='g-', secondary_y=True, legend='ratio')
ax.set_ylabel('gross and budget', color='b')
graph.plot(ax=ax,y=["budget", "gross"], kind="bar")
plt.ylabel("gross-budget ratio")
ax.tick_params(axis='x', rotation=90)
# move the legend onto the twin axis so it is not hidden behind the bars
leg = ax.get_legend()
leg.remove() # remove it from ax
ax2.add_artist(leg) # add it to ax2
leg._set_loc(2) # NOTE(review): private matplotlib API — may break on upgrade
plt.title("Gross budget levels overtime")
plt.show()
# Compare the actor's yearly mean gross/budget ratio against the dataset-wide
# yearly mean (ratios above 25.372 dropped from the dataset side as outliers).
actor_titles_years = actor_films["title_year"]
graph=actor_films["budget"].to_frame()
graph['gross_budget_ratio']=actor_films["gross_budget_ratio"]
graph['gross']=actor_films["gross"]
graph.index=actor_titles_years
graph=graph.groupby(graph.index).mean() # one row per release year
dataj=movie_df[['gross_budget_ratio','title_year']]
dataj=dataj.loc[dataj['gross_budget_ratio']<25.372]
grouped_by_yearj = dataj.groupby("title_year").gross_budget_ratio.mean()
grouped_by_yearj = grouped_by_yearj[grouped_by_yearj.index.isin(actor_films.title_year)]
grouped_by_yearj=grouped_by_yearj.to_frame()
grouped_by_yearj.index=graph.index
grouped_by_yearj.rename(columns={"gross_budget_ratio": "gross_budget_ratio_mean"}, inplace=True)
grouped_by_yearj['gross_budget_ratio_depp']=graph.gross_budget_ratio
ax = plt.subplot(111)
grouped_by_yearj.plot(ax=ax,y=["gross_budget_ratio_mean",'gross_budget_ratio_depp'], kind="bar",legend='ratio')
ax.tick_params(axis='x', rotation=90)
plt.title("Gross budget ratio compared to average dataset")
plt.show()
# Same comparison for cast_total_facebook_likes: actor's yearly means vs. the
# dataset-wide yearly means.
actor_titles_years = actor_films["title_year"]
graph=actor_films["cast_total_facebook_likes"].to_frame()
graph.index=actor_titles_years
graph=graph.groupby(graph.index).mean() # one row per release year
dataj=movie_df[['cast_total_facebook_likes','title_year']]
grouped_by_yearj = dataj.groupby("title_year").cast_total_facebook_likes.mean()
grouped_by_yearj = grouped_by_yearj[grouped_by_yearj.index.isin(actor_films.title_year)]
grouped_by_yearj=grouped_by_yearj.to_frame()
grouped_by_yearj.index=graph.index
grouped_by_yearj.rename(columns={"cast_total_facebook_likes": "cast_total_facebook_likes_mean"}, inplace=True)
grouped_by_yearj['cast_total_facebook_likes_depp']=graph.cast_total_facebook_likes
ax = plt.subplot(111)
grouped_by_yearj.plot(ax=ax,y=["cast_total_facebook_likes_mean",'cast_total_facebook_likes_depp'], kind="bar",legend='ratio')
ax.tick_params(axis='x', rotation=90)
plt.title(f"cast total facebook likes for films with {actor} compared to dataset average ")
plt.show()
# Same comparison for IMDb score: actor's yearly means vs. dataset yearly means.
actor_titles_years = actor_films["title_year"]
graph=actor_films["imdb_score"].to_frame()
graph.index=actor_titles_years
graph=graph.groupby(graph.index).mean() # one row per release year
dataj=movie_df[['imdb_score','title_year']]
grouped_by_yearj = dataj.groupby("title_year").imdb_score.mean()
grouped_by_yearj = grouped_by_yearj[grouped_by_yearj.index.isin(actor_films.title_year)]
grouped_by_yearj=grouped_by_yearj.to_frame()
grouped_by_yearj.index=graph.index
grouped_by_yearj.rename(columns={"imdb_score": "imdb_score_mean"}, inplace=True)
grouped_by_yearj['imdb_score_depp']=graph.imdb_score
ax = plt.subplot(111)
grouped_by_yearj.plot(ax=ax,y=["imdb_score_mean",'imdb_score_depp'], kind="bar",legend='ratio')
ax.tick_params(axis='x', rotation=90)
plt.title(f"IMDb score for films with {actor} compared to dataset average ")
plt.show()
# US-only subset for the remainder of the analysis.
movieus_df = movie_df[movie_df.country == 'USA'] # keep only usa movies
movieus_df = movieus_df.drop(columns='country') # drop country column (now constant)
Firstly, we plot a general description of our data:
movieus_df.describe()  # summary statistics of the US-only subset
Distribution of the main numerical features of the data set. Due to the right skew of the distributions of budget and gross earnings, it is appropriate to consider the log transformation of these variables.
# Same raw-vs-log distribution plots as above, on the US-only subset.
for col in ["duration", "gross", "actor_1_facebook_likes","imdb_score","budget","gross_budget_ratio"]:
_, (ax1, ax2) = plt.subplots(figsize=(12,4), nrows=1, ncols=2)
sns.distplot(movieus_df[col].fillna(0),color='g', ax=ax1)
sns.distplot(np.log(movieus_df[col].fillna(0) + 1), color="r", ax=ax2)
for ax in (ax1, ax2):
ax.tick_params(size=12)
ax.set_ylabel("density", size=15)
ax1.set_title(f"Distribution of {col}", size=15)
ax2.set_title(f"Distribution of log-{col}", size=15)
We can see that the distributions of budget and gross of movies are not so explanatory. This is due to the fact that the data set includes both box-office movies and documentaries, which have a significant difference in marketing and financial needs. So it might be interesting to look at the number of movies by genre.
# Summary table of gross, budget, profit and gross/budget ratio (US subset).
grossde=movieus_df['gross'].describe()
budgetde=movieus_df['budget'].describe()
ratiogb=movieus_df['gross_budget_ratio'].describe()
profitde=(movieus_df['gross']-movieus_df['budget']).describe()
summary = pd.DataFrame([grossde,budgetde,profitde,ratiogb]).T
summary.columns.values[2] = "profits"
summary#summary statistics of relevant variables
# Correlation heatmap between IMDb score and gross/budget ratio on the US
# subset (ratio outliers > 20 dropped).
movies_with_valid_profit = movieus_df[movieus_df.gross_budget_ratio<=20]
# again, don't see much influence here (between what?)
pivot=movieus_df[['imdb_score',"gross_budget_ratio"]]
pivot=pivot[pivot.gross_budget_ratio<=20]
corp = pivot.apply(lambda x: pd.factorize(x)[0])#scaling
corp=corp.corr()
plt.figure(figsize=(8, 4))
ax = sns.heatmap(corp, xticklabels=corp.columns, yticklabels=corp.columns,
linewidths=.2, cmap="YlGnBu")
plt.yticks(rotation=0)
plt.show()
## helper that writes the correlation coefficient explicitly in the pairplot
def cor(x, y, **kwargs):
    """Annotate the current axes with the Pearson correlation of x and y."""
    r = np.corrcoef(x, y)[0][1]
    label = r'$\rho$ = ' + str(round(r, 2))
    axis = plt.gca()
    # place the rho label near the top-left corner, in axes coordinates
    axis.annotate(label, xy=(0.2, 0.95), size=15, xycoords=axis.transAxes)
# Pairplot of score vs profitability (outlier ratios > 20 removed), with the
# Pearson rho written on the upper-triangle panels by the cor() helper above.
grid=sns.pairplot(data=movieus_df[movieus_df.gross_budget_ratio<=20], vars=["imdb_score", "gross_budget_ratio"])
grid = grid.map_upper(cor)
If we remove the outliers of profitability ratio, we see that to have a profitability ratio higher than 5, a movie must have an IMDb score of 4 and above.
# Mean gross/budget ratio per IMDb-score band. Bands are half-open (low, high]:
# low < score <= high, matching the original chained comparisons.
score_bands = [(1, 2.5), (2.5, 4), (4, 5.5), (5.5, 7), (7, 8.5), (8.5, 10)]
movies_with_imdb_classmean = [
    movieus_df[(lo < movieus_df.imdb_score) & (movieus_df.imdb_score <= hi)]
    .gross_budget_ratio.mean()
    for lo, hi in score_bands
]
Limits = ['1-2.5', '2.5-4', '4-5.5', '5.5-7', '7-8.5', '8.5-10']
sns.barplot(Limits, movies_with_imdb_classmean)
plt.xticks(rotation=0)
plt.ylabel("Gross/Budget Ratio")
plt.xlabel('IMDb Scores')
plt.title("Mean Gross/Budget Ratios")
plt.show()
# Mean IMDb score per gross/budget-ratio band; bands are (low, high].
# The last band is open-ended (ratio > 20), expressed with an infinite upper edge.
ratio_bands = [(0, 1), (1, 2.5), (2.5, 5), (5, 10), (10, 20), (20, float('inf'))]
movies_with_gb_classmean = [
    movieus_df[(lo < movieus_df.gross_budget_ratio) & (movieus_df.gross_budget_ratio <= hi)]
    .imdb_score.mean()
    for lo, hi in ratio_bands
]
# BUG FIX: the first label was '1-1'; the first band is actually (0, 1].
Limits = ['0-1', '1-2.5', '2.5-5', '5-10', '10-20', '20+']
sns.barplot(Limits, movies_with_gb_classmean)
plt.xticks(rotation=0)
plt.ylabel("IMDb Scores")
plt.xlabel('Gross/Budget Ratios')
plt.title("Mean IMDb Scores")
plt.show()
## influence of director/actor ranks on score and profitability

# Rank correlations on the US-only data.
datara = movieus_df[['imdb_score', 'director_rank', 'actor1_rank', 'actor2_rank', 'actor3_rank',
                     'gross_budget_ratio']]
corre = datara.corr()
plt.figure(figsize=(12, 6))
ax = sns.heatmap(corre, xticklabels=corre.columns, yticklabels=corre.columns,
                 linewidths=.2, cmap="YlGnBu")

# Facebook-likes features vs score/profitability, recent movies only (after 2009),
# integer-encoded with factorize before correlating.
datapi = movieus_df[["cast_total_facebook_likes", "movie_facebook_likes", "director_facebook_likes",
                     'imdb_score', "gross_budget_ratio", "title_year"]]
datapi = datapi.loc[datapi.title_year > 2009]
datapi = datapi.drop("title_year", axis=1)
datapi = datapi.apply(lambda x: pd.factorize(x)[0])
core = datapi.corr()

# Same rank correlation, but on the full (not US-restricted) data set.
datara = movie_df[['imdb_score', 'director_rank', 'actor1_rank', 'actor2_rank', 'actor3_rank',
                   'gross_budget_ratio']]
corre = datara.corr()
plt.figure(figsize=(12, 6))
# BUG FIX: this heatmap previously plotted the `core` matrix while labelling it
# with `corre`'s columns (copy-paste slip); plot the matrix just computed.
ax = sns.heatmap(corre, xticklabels=corre.columns, yticklabels=corre.columns,
                 linewidths=.2, cmap="YlGnBu")

# Heatmap of the likes-feature correlations.
plt.figure(figsize=(12, 6))
ax = sns.heatmap(core, xticklabels=core.columns, yticklabels=core.columns,
                 linewidths=.2, cmap="YlGnBu")
Here, we realize that IMDb score and movie facebook likes might be related. Hence, we analyze these two variables.
# Mean total cast Facebook likes per IMDb-score band.
# NOTE(review): datapi only contains movies with title_year > 2009 while the
# boolean masks are built from the full movieus_df; pandas aligns them on the
# index, so rows absent from datapi are dropped -- confirm this is intended.
movies_with_imdb_classmean2=[]
movies_with_imdb_classmean2.append(datapi[(1<movieus_df.imdb_score) & (movieus_df.imdb_score<=2.5) ].cast_total_facebook_likes.mean())
movies_with_imdb_classmean2.append(datapi[(2.5<movieus_df.imdb_score) & (movieus_df.imdb_score<=4) ].cast_total_facebook_likes.mean())
movies_with_imdb_classmean2.append(datapi[(4<movieus_df.imdb_score) & (movieus_df.imdb_score<=5.5) ].cast_total_facebook_likes.mean())
movies_with_imdb_classmean2.append(datapi[(5.5<movieus_df.imdb_score) & (movieus_df.imdb_score<=7) ].cast_total_facebook_likes.mean())
movies_with_imdb_classmean2.append(datapi[(7<movieus_df.imdb_score) & (movieus_df.imdb_score<=8.5) ].cast_total_facebook_likes.mean())
movies_with_imdb_classmean2.append(datapi[(8.5<movieus_df.imdb_score) & (movieus_df.imdb_score<=10) ].cast_total_facebook_likes.mean())
Limits =['1-2.5','2.5-4','4-5.5','5.5-7','7-8.5','8.5-10']
sns.barplot(Limits, movies_with_imdb_classmean2)
plt.xticks(rotation=0)
plt.ylabel("cast_total_facebook_likes")
plt.xlabel('IMDb Scores')
plt.title(f"Mean total cast Facebook Likes")
plt.yticks(rotation=90)
plt.show()
# Mean total cast Facebook likes per gross/budget-ratio band; bands are (low, high],
# the last one open-ended. Masks are built from movieus_df and aligned on the
# index against datapi, exactly as in the original code.
ratio_bands = [(0, 1), (1, 2.5), (2.5, 5), (5, 10), (10, 20), (20, float('inf'))]
movies_with_gb_classmean2 = [
    datapi[(lo < movieus_df.gross_budget_ratio) & (movieus_df.gross_budget_ratio <= hi)]
    .cast_total_facebook_likes.mean()
    for lo, hi in ratio_bands
]
# BUG FIX: the first label was '1-1'; the first band is actually (0, 1].
Limits = ['0-1', '1-2.5', '2.5-5', '5-10', '10-20', '20+']
sns.barplot(Limits, movies_with_gb_classmean2)
plt.xticks(rotation=0)
plt.ylabel("cast total Facebook Likes")
plt.xlabel('gross_budget_ratio')
plt.title("Mean total cast Facebook Likes")
plt.show()
Here, we see that the top 4 combinations all have more than 140 movies, so we can comment more confidently. All four segments include Drama, and all of them have a gross/budget ratio of about 2, which is a nice number for film makers. By looking at the data, we can say that:
Hence, it might be logical to make a drama movie with some comedy and romance in it.
Considering all movies
# Mean profitability (gross/budget ratio) per genre, over all movies.
gen_df = movie_df[['gross_budget_ratio', 'budget', 'gross']]
movie_genre_df = movie_df[['Biography', 'Comedy', 'Crime', 'Drama', 'Romance', 'Mystery_Thriller_Horror',
                           'Sci-Fi_Fantasy', 'Family_Animation', 'Action_Adventure', 'History_War', 'Others']]
gen_df = gen_df.merge(movie_genre_df, how='left', left_index=True, right_index=True)
genr = [
    'Biography', 'Comedy', 'Crime', 'Drama', 'Romance', 'Mystery_Thriller_Horror',
    'Sci-Fi_Fantasy', 'Family_Animation', 'Action_Adventure', 'History_War', 'Others']
column = ['gross_budget_ratio']
prof_gen = pd.DataFrame(index=genr, columns=column)
for gen in genr:
    # mean ratio over movies flagged with this genre
    ab = gen_df.loc[gen_df[gen] == 1]
    ab = ab.gross_budget_ratio.mean()
    # use .loc instead of chained indexing (prof_gen[col][gen]) -- the chained
    # form triggers SettingWithCopy and may silently fail on newer pandas
    prof_gen.loc[gen, 'gross_budget_ratio'] = ab
prof_gen = prof_gen.sort_values(by=['gross_budget_ratio'])
prof_gen.plot.bar()
# words_df is a dataframe which contains what we need for the keyword analysis
# NOTE(review): the empty-list assignment on the next line is immediately
# overwritten and is dead code.
words_df =[]
words_df = movie_df[['plot_keywords','budget', 'gross']]
# a list with all the plot keywords for each movie
words = words_df.plot_keywords
words = [x.split("|") for x in words]
for i in range(len(words_df.plot_keywords)): # for each movie
words[i] = [x.split() for x in words[i]] # splitted words of that movie
w = []
# flatten the nested lists so each movie maps to one flat list of single words
for j in range(len(words[i])):
for k in range(len(words[i][j])):
# keep only words longer than 2 chars that contain no embedded space
if len(words[i][j][k])>2 and ' ' not in words[i][j][k]:
w.append(words[i][j][k])
words[i] = w
# delete duplicates while preserving first-seen order
words[i] = list(OrderedDict.fromkeys(words[i]))
words_df.plot_keywords = words
words_df.head()
# we need the genre of each movie: load the one-hot genre table and join it
# onto the keyword frame by imdb_id (both frames are indexed by imdb_id).
movie_genre_df = pd.read_csv(r'data/data_genre.csv')
movie_genre_df = movie_genre_df.set_index('imdb_id')
words_df = words_df.merge(movie_genre_df, how='left', left_index=True, right_index=True)
words_df.head()
# For each genre: collect its plot keywords, then report per-word mean budget,
# mean gross, and their ratio over the movies of that genre containing the word.
genres = [
'Biography','Comedy','Crime','Drama','Romance','Mystery_Thriller_Horror',
'Sci-Fi_Fantasy','Family_Animation','Action_Adventure','History_War', 'Others']
for genre in genres: # for each genre
print(color.BOLD + genre + color.END +":")
print('\n')
column = words_df[genre]
words_genre_df = words_df[column==1]
words_genre = []
# NOTE(review): this initialization is dead -- num_film_word is re-created
# with a different length a few lines below.
num_film_word = [0]*len(words_genre_df) # num of movies of that genre in which that word appears
# flatten: gather every qualifying keyword of every movie of this genre.
# NOTE(review): .plot_keywords[j] uses integer j against an imdb_id index,
# relying on pandas' positional fallback for non-integer indexes -- fragile
# on newer pandas; verify against the pinned version.
for j in range(len(words_genre_df)):
for k in range(len(words_genre_df.plot_keywords[j])):
if len(words_genre_df.plot_keywords[j][k])>2 and ' ' not in words_genre_df.plot_keywords[j][k]:
words_genre.append(words_genre_df.plot_keywords[j][k])
# list of all distinct words of this genre (first-seen order preserved):
words_genre = list(dict.fromkeys(words_genre))
budget_word = [0] * len(words_genre)
gross_word = [0] * len(words_genre)
num_film_word = [0] * len(words_genre)
#for each word of that genre see the mean budget and mean gross of all movies of that genre which contain this word:
for i in range(len(words_genre)):
word = words_genre[i]
budget_word[i] = 0
gross_word[i] = 0
for j in range(len(words_genre_df)): # for all films of that genre
words_movie = words_genre_df.plot_keywords[j]
if word in words_movie:
num_film_word[i] = num_film_word[i] + 1
budget_word[i] = budget_word[i] + words_genre_df.budget[j]
gross_word[i] = gross_word[i] + words_genre_df.gross[j]
if num_film_word[i] > 2: # no if zero (not find) or an exception (only one/two)
# compute the means:
budget_word[i] = budget_word[i]/num_film_word[i]
gross_word[i] = gross_word[i]/num_film_word[i]
else:
# mark too-rare words with 0 so they can be filtered out below
words_genre[i] = 0
# create a dataframe for this genre: one row per surviving keyword
word_budget_df = pd.DataFrame([words_genre,budget_word, gross_word, num_film_word]).T
word_budget_df = word_budget_df.rename(columns={0:"word", 1:"mean_budget", 2:"mean_gross", 3:"num_film_word"})
word_budget_df = word_budget_df[word_budget_df.word != 0]
word_budget_df = word_budget_df[word_budget_df.num_film_word >3] # only words which appear in more than 3 movies
word_budget_df = word_budget_df.set_index('word')
word_budget_df["gross_budget_ratio"] = word_budget_df.mean_gross / word_budget_df.mean_budget
# print results
print("Order by budget:")
display(word_budget_df.sort_values('mean_budget', ascending=False)[['mean_budget', 'num_film_word']].T.head(10)) #order by budget
print("\n")
print("Order by gross:")
display(word_budget_df.sort_values('mean_gross', ascending=False)[['mean_gross', 'num_film_word']].T.head(10)) #order by gross
print("\n")
print("Order by ratio:")
display(word_budget_df.sort_values('gross_budget_ratio', ascending=False)[['gross_budget_ratio', 'num_film_word']].T.head(10)) #order by ratio
print("\n")
# Build a TF-IDF matrix over each movie's space-joined keyword string.
words_df = movie_df[['plot_keywords']]
words = words_df.plot_keywords
words = [x.split("|") for x in words]
words = [" ".join(x) for x in words]
print(f"number of movies:{len(words_df)}")
words_df.head()
print(f"List of keywords for the first 5 movies:")
words[0:5]
count_model = TfidfVectorizer(ngram_range=(1,1))
# v is (n_movies x n_terms); v * v.T is therefore a movie-movie similarity
# matrix (one row per movie), which is what the PCA below consumes.
# NOTE(review): a word-word co-occurrence matrix would be v.T * v instead.
v = count_model.fit_transform(words)
Xc = (v * v.T ) # movie-movie similarity in sparse csr format
Xc.setdiag(0) # zero self-similarity on the diagonal
X = Xc.todense()
print(X.shape)
v.shape
# One-hot genre columns, one row per movie.
movie_genre_df = movie_df[['Biography','Comedy','Crime','Drama','Romance','Mystery_Thriller_Horror','Sci-Fi_Fantasy','Family_Animation','Action_Adventure','History_War', 'Others']]
movie_genre_df.head()

# Assign a single genre label to each movie: the first match in this priority
# order wins (same precedence as the original if/elif chain); anything else
# falls through to 'Other'.
genre_priority = ['Drama', 'Comedy', 'Mystery_Thriller_Horror', 'Action_Adventure',
                  'Sci-Fi_Fantasy', 'Romance', 'Crime', 'Family_Animation']
target = []
for _, rowData in movie_genre_df.iterrows():
    for g in genre_priority:
        if rowData[g] == 1:
            target.append(g)
            break
    else:
        target.append('Other')

# one random hex color per target class (9 classes at most)
colors = ['#%06X' % randint(0, 0xFFFFFF) for _ in range(9)]
# distinct targets in first-seen order, mapped to colors
target_no_dup = list(dict.fromkeys(target))
dic_tar_col = dict(zip(target_no_dup, colors))
dic_tar_col
len(target)
# Project the movie-movie similarity matrix onto its first two principal
# components and scatter-plot the movies colored by assigned genre.
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(X)
# NOTE(review): com1/com2 are not used below -- presumably leftovers.
com1 = principalComponents[:,0]
com2 = principalComponents[:,1]
finalDf = pd.DataFrame(data = principalComponents
, columns = ['principal component 1', 'principal component 2'])
finalDf['Target'] = target
# one row for each movie
finalDf.head()
pca.explained_variance_
pca.components_
fig = plt.figure(figsize = (10,10))
ax = fig.add_subplot(1,1,1)
ax.set_xlabel('Principal Component 1', fontsize = 15)
ax.set_ylabel('Principal Component 2', fontsize = 15)
ax.set_title('2 Component PCA', fontsize = 20)
# one scatter call per genre so each gets its own color and legend entry
for i in range(len(dic_tar_col)):
t = list(dic_tar_col.keys())[i]
cc = dic_tar_col.get(t)
indicesToKeep = finalDf['Target'] == t
ax.scatter(finalDf.loc[indicesToKeep, 'principal component 1']
, finalDf.loc[indicesToKeep, 'principal component 2']
, c = cc
, s = 50)
ax.legend(list(dic_tar_col.keys()))
ax.grid()
From this plot we can conclude that, in this case, PCA is not useful for identifying different genres: the colors are quite mixed. What we hoped to obtain were distinguishable groups, each identifying a set of words related to a specific genre — for example, for Romance, keywords like kiss, love, Paris... The fact that all keywords are quite mixed shows that a word can be suitable for several genres, which makes the groups hard to identify.
# Persist the US-only data, then reload the full regression data set.
movie_df.to_csv('data/data_regression_onlyUS.csv')
movie_df = pd.read_csv(r'data/data_regression.csv')
# Drop identifiers, free-text columns and leakage-prone targets to build the
# classification feature table (gross_budget_ratio remains to derive the label).
classification_df = movie_df.drop(columns=['imdb_id', 'director_name', 'num_critic_for_reviews',
'actor_2_name',
'gross', 'genres', 'actor_1_name',
'movie_title', 'num_voted_users',
'actor_3_name', 'plot_keywords',
'num_user_for_reviews',
'title_year', 'imdb_score',
'movie_facebook_likes'])
classification_df.head()
classification_df.columns
# Drop the categorical columns we won't use, one-hot encode the MPAA rating,
# and discard the rare rating categories.
classification1_df = classification_df.drop(columns=['country', 'language'])
df1 = pd.get_dummies(classification1_df, columns=['content_rating'])
df1 = df1.drop(columns=['content_rating_Not Rated', 'content_rating_G',
                        'content_rating_X', 'content_rating_NC-17'])
# Tukey-fence outlier removal on the target ratio.
Q1 = df1['gross_budget_ratio'].quantile(0.25)
Q3 = df1['gross_budget_ratio'].quantile(0.75)
IQR = Q3 - Q1  # interquartile range
# between() is inclusive on both ends, matching the original >= / <= pair
filtering = df1['gross_budget_ratio'].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)
df1 = df1.loc[filtering]
df1
# Discretize the ratio into 3 classes: 0 = loss (<=1), 1 = low profit (<=2),
# 2 = high profit (>2).
RatioClass = df1['gross_budget_ratio'].transform(lambda x: 0 if x <= 1 else (1 if x <= 2 else 2))
RatioClass = RatioClass.to_frame()
# BUG FIX: value_counts() orders by frequency, so the bars below would be
# silently mislabeled whenever class 0 is not the most frequent; sort by the
# class label instead so 'Loss'/'Low Profit'/'High Profit' match classes 0/1/2.
Counts = RatioClass['gross_budget_ratio'].value_counts().sort_index()
Counts
sns.barplot(['Loss', 'Low Profit', 'High Profit'], Counts)
plt.xticks(rotation=0)
plt.ylabel("Number of movies")
plt.title("Number of movies")
plt.show()
# Copy of the features with the discretized target in place of the raw ratio.
df11 = df1.copy()
df11['gross_budget_ratio'] = RatioClass['gross_budget_ratio']
df11
# Stratified 70/30 train/test split on the 3-class target.
X = df11.drop(columns=['gross_budget_ratio'])
y = df11['gross_budget_ratio']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
# Oversample with SMOTE twice so both minority classes are rebalanced.
# NOTE(review): SMOTE('minority') and fit_sample are the legacy imbalanced-learn
# API (replaced by sampling_strategy= and fit_resample) -- pin the version.
smote=SMOTE('minority')
X_s, y_s = smote.fit_sample(X_train,y_train)
X_sm, y_sm = smote.fit_sample(X_s,y_s)
X_sm_train, y_sm_train = shuffle(X_sm, y_sm, random_state=10)
# Multinomial logistic regression: grid-search C and penalty with 5-fold CV on
# the SMOTE-balanced training set, then evaluate on the held-out test set.
C_List = [0.001,0.01,0.1,1,10,100,1000]
penalty = ['l2','none']
parameters = {'C': C_List,'penalty':penalty}
MLR = LogisticRegression(multi_class='multinomial', solver='newton-cg')
clf1 = GridSearchCV(MLR, parameters, cv=5, verbose=0,scoring ='accuracy')
clf1.fit(X_sm_train, y_sm_train)
y_pred = clf1.predict(X_test)
print("MLR best parameters:", clf1.best_params_)
print("MLR Train score:", clf1.score(X_sm_train, y_sm_train))
print("MLR Test score:", clf1.score(X_test, y_test))
ax = plt.subplot()
Model_Predictions = clf1.predict(X_test)
# confusion_matrix(pred, true): rows are predictions, columns are true labels,
# consistent with the axis labels set below.
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
# widen the y-limits by half a cell -- workaround for heatmap rows being
# clipped on some matplotlib versions (presumably 3.1.x; confirm)
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Multiclass Logistic Regression')
# Decision tree: grid-search split criterion, depth and leaf/split fractions.
# NOTE(review): unlike the logistic model above, this is fit on the raw
# (non-SMOTE) training split -- confirm that is intentional.
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTC = DecisionTreeClassifier(random_state=10)
clf2 = GridSearchCV(DTC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf2.fit(X_train, y_train)
y_pred = clf2.predict(X_test)
print("DTC best parameters:", clf2.best_params_)
print("DTC Train score:", clf2.score(X_train, y_train))
print("DTC Test score:", clf2.score(X_test, y_test))
ax = plt.subplot()
Model_Predictions = clf2.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
# half-cell y-limit workaround for clipped heatmap rows
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Decision Tree Classifier')
# Random forest: grid-search criterion, depth and ensemble size (raw train split).
RFEstimatorList = [25,50,100,200]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFC = RandomForestClassifier(random_state=10)
clf3 = GridSearchCV(RFC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("RFC best parameters:", clf3.best_params_)
print("RFC Train score:", clf3.score(X_train, y_train))
print("RFC Test score:", clf3.score(X_test, y_test))
ax = plt.subplot()
Model_Predictions = clf3.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
# half-cell y-limit workaround for clipped heatmap rows
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Random Forest Classifier')
# KNN: distance-based, so features are min-max scaled first (scaler fit on
# train only, then applied to test to avoid leakage).
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
Neighbor_List=[3,5,10,20,50]
parameters = {'n_neighbors':Neighbor_List}
KNNC = KNeighborsClassifier()
clf4 = GridSearchCV(KNNC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf4.fit(X_train_scaled, y_train)
y_pred = clf4.predict(X_test_scaled)
print("KNN best parameters:", clf4.best_params_)
print("KNN Train score:", clf4.score(X_train_scaled, y_train))
print("KNN Test score:", clf4.score(X_test_scaled, y_test))
ax = plt.subplot()
Model_Predictions = clf4.predict(X_test_scaled)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
# half-cell y-limit workaround for clipped heatmap rows
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('KNN Classifier')
# Binary target: 0 = loss (ratio <= 1), 1 = profitable (ratio > 1).
# (df1 contains no NaN ratios here -- they were removed by the IQR filter --
# so the boolean cast is equivalent to the original elementwise lambda.)
RatioClass = (df1['gross_budget_ratio'] > 1).astype(int)
RatioClass = RatioClass.to_frame()
Counts = RatioClass['gross_budget_ratio'].value_counts()
Counts
df12 = df1.copy()
df12['gross_budget_ratio'] = RatioClass['gross_budget_ratio']
X = df12.drop(columns=['gross_budget_ratio'])
y = df12['gross_budget_ratio']
# stratified 70/30 split on the binary label
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
# Binary logistic regression; the grid search optimizes recall of class 0
# (Loss), i.e. it favors catching money-losing movies over overall accuracy.
C_List = [0.001,0.01,0.1,1,10,100,1000]
penalty = ['l1','l2']
parameters = {'C': C_List,'penalty':penalty}
LR = LogisticRegression( solver='liblinear')
clf1 = GridSearchCV(LR, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
print("LR best parameters:", clf1.best_params_)
# Train/Test "score" below is the recall-of-Loss scorer, not accuracy.
print("LR Train score:", clf1.score(X_train, y_train))
print("LR Test score:", clf1.score(X_test, y_test))
print("LR Test Accuracy score:", accuracy_score(y_pred, y_test))
ax = plt.subplot()
Model_Predictions = clf1.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
# half-cell y-limit workaround for clipped heatmap rows
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Logistic Regression')
# Binary decision tree, tuned for recall of the Loss class (pos_label=0).
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTC = DecisionTreeClassifier(random_state=10)
clf2 = GridSearchCV(DTC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf2.fit(X_train, y_train)
y_pred = clf2.predict(X_test)
print("DTC best parameters:", clf2.best_params_)
print("DTC Train score:", clf2.score(X_train, y_train))
print("DTC Test score:", clf2.score(X_test, y_test))
print("DTC Test Accuracy score:", accuracy_score(y_pred, y_test))
ax = plt.subplot()
Model_Predictions = clf2.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
# half-cell y-limit workaround for clipped heatmap rows
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Decision Tree Classifier')
# Binary random forest, tuned for recall of the Loss class (pos_label=0).
RFEstimatorList = [25,50,100,200]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFC = RandomForestClassifier(random_state=10)
clf3 = GridSearchCV(RFC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("RFC best parameters:", clf3.best_params_)
print("RFC Train score:", clf3.score(X_train, y_train))
print("RFC Test score:", clf3.score(X_test, y_test))
print("RFC Test Accuracy score:", accuracy_score(y_pred, y_test))
ax = plt.subplot()
Model_Predictions = clf3.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
# half-cell y-limit workaround for clipped heatmap rows
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Random Forest Classifier')
# Binary KNN on min-max scaled features, tuned for recall of the Loss class.
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
Neighbor_List=[3,5,10,20,50]
parameters = {'n_neighbors':Neighbor_List}
KNNC = KNeighborsClassifier()
clf4 = GridSearchCV(KNNC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf4.fit(X_train_scaled, y_train)
y_pred = clf4.predict(X_test_scaled)
print("KNN best parameters:", clf4.best_params_)
print("KNN Train score:", clf4.score(X_train_scaled, y_train))
print("KNN Test score:", clf4.score(X_test_scaled, y_test))
print("KNN Test Accuracy score:", accuracy_score(y_pred, y_test))
ax = plt.subplot()
Model_Predictions = clf4.predict(X_test_scaled)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
# half-cell y-limit workaround for clipped heatmap rows
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('KNN Classifier')
# Second feature set: log-compress the heavy-tailed features.
df2 = df1.copy()
df2['budget'] = np.log(df2['budget'])
# Likes counts of 3 or fewer are floored to 0; larger counts are log-transformed.
# One loop replaces five identical copy-pasted lines.
like_cols = ['director_facebook_likes', 'actor_1_facebook_likes',
             'actor_2_facebook_likes', 'actor_3_facebook_likes',
             'cast_total_facebook_likes']
for col in like_cols:
    df2[col] = df2[col].transform(lambda x: 0 if x <= 3 else np.log(x))
df2.columns
# Drop features that did not help in the previous experiments.
df2 = df2.drop(columns=['facenumber_in_poster', 'Biography', 'Crime',
                        'Mystery_Thriller_Horror', 'Sci-Fi_Fantasy',
                        'History_War'])
# Same 3-class discretization as before, now on the log-transformed features.
RatioClass = df2['gross_budget_ratio'].transform(lambda x: 0 if x <= 1 else (1 if x <= 2 else 2))
RatioClass = RatioClass.to_frame()
# NOTE(review): value_counts() orders by frequency, not by class label.
Counts=RatioClass['gross_budget_ratio'].value_counts()
Counts
df21 = df2.copy()
df21['gross_budget_ratio'] = RatioClass['gross_budget_ratio']
df21
X = df21.drop(columns=['gross_budget_ratio'])
y = df21['gross_budget_ratio']
# stratified 70/30 split on the 3-class target
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
# SMOTE-rebalance both minority classes, then rerun the multinomial logistic
# regression pipeline on the log-transformed feature set.
# NOTE(review): SMOTE('minority') / fit_sample are legacy imbalanced-learn API.
smote=SMOTE('minority')
X_s, y_s = smote.fit_sample(X_train,y_train)
X_sm, y_sm = smote.fit_sample(X_s,y_s)
X_sm_train, y_sm_train = shuffle(X_sm, y_sm, random_state=10)
C_List = [0.001,0.01,0.1,1,10,100,1000]
penalty = ['l2','none']
parameters = {'C': C_List,'penalty':penalty}
MLR = LogisticRegression(multi_class='multinomial', solver='newton-cg')
clf1 = GridSearchCV(MLR, parameters, cv=5, verbose=0,scoring ='accuracy')
clf1.fit(X_sm_train, y_sm_train)
y_pred = clf1.predict(X_test)
print("MLR best parameters:", clf1.best_params_)
print("MLR Train score:", clf1.score(X_sm_train, y_sm_train))
print("MLR Test score:", clf1.score(X_test, y_test))
ax = plt.subplot()
Model_Predictions = clf1.predict(X_test)
# rows = predictions, columns = true labels (matches axis labels below)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
# half-cell y-limit workaround for clipped heatmap rows
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Multiclass Logistic Regression')
# Decision tree on the log-transformed feature set (raw train split, not SMOTE).
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTC = DecisionTreeClassifier(random_state=10)
clf2 = GridSearchCV(DTC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf2.fit(X_train, y_train)
y_pred = clf2.predict(X_test)
print("DTC best parameters:", clf2.best_params_)
print("DTC Train score:", clf2.score(X_train, y_train))
print("DTC Test score:", clf2.score(X_test, y_test))
ax = plt.subplot()
Model_Predictions = clf2.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
# half-cell y-limit workaround for clipped heatmap rows
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Decision Tree Classifier')
# Random forest on the log-transformed feature set.
RFEstimatorList = [25,50,100,200]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFC = RandomForestClassifier(random_state=10)
clf3 = GridSearchCV(RFC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("RFC best parameters:", clf3.best_params_)
print("RFC Train score:", clf3.score(X_train, y_train))
print("RFC Test score:", clf3.score(X_test, y_test))
ax = plt.subplot()
Model_Predictions = clf3.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
# half-cell y-limit workaround for clipped heatmap rows
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Random Forest Classifier')
# KNN on min-max scaled, log-transformed features (scaler fit on train only).
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
Neighbor_List=[3,5,10,20,50]
parameters = {'n_neighbors':Neighbor_List}
KNNC = KNeighborsClassifier()
clf4 = GridSearchCV(KNNC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf4.fit(X_train_scaled, y_train)
y_pred = clf4.predict(X_test_scaled)
print("KNN best parameters:", clf4.best_params_)
print("KNN Train score:", clf4.score(X_train_scaled, y_train))
print("KNN Test score:", clf4.score(X_test_scaled, y_test))
ax = plt.subplot()
Model_Predictions = clf4.predict(X_test_scaled)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
# half-cell y-limit workaround for clipped heatmap rows
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('KNN Classifier')
# Binary target on the log-transformed features: 0 = loss, 1 = profitable.
RatioClass = df2['gross_budget_ratio'].transform(lambda x: 0 if x <= 1 else 1)
RatioClass = RatioClass.to_frame()
# BUG FIX: value_counts() orders by frequency, so when the profitable class is
# the majority the 'Loss' bar below would actually show the profitable count;
# sort by class label so the bars match ['Loss', 'Profitable'].
Counts = RatioClass['gross_budget_ratio'].value_counts().sort_index()
Counts
sns.barplot(['Loss', 'Profitable'], Counts)
plt.xticks(rotation=0)
plt.ylabel("Number of movies")
plt.title("Number of movies")
plt.show()
df22 = df2.copy()
df22['gross_budget_ratio'] = RatioClass['gross_budget_ratio']
X = df22.drop(columns=['gross_budget_ratio'])
y = df22['gross_budget_ratio']
# stratified 70/30 split on the binary label
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
C_List = [0.001,0.01,0.1,1,10,100,1000]
penalty = ['l1','l2']
parameters = {'C': C_List,'penalty':penalty}
LR = LogisticRegression( solver='liblinear')
clf1 = GridSearchCV(LR, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
print("LR best parameters:", clf1.best_params_)
print("LR Train score:", clf1.score(X_train, y_train))
print("LR Test score:", clf1.score(X_test, y_test))
print("LR Test Accuracy score:", accuracy_score(y_pred, y_test))
ax = plt.subplot()
Model_Predictions = clf1.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Logistic Regression')
# Decision tree on the binary task, tuned for recall of the "Loss" class.
# min_samples_leaf / min_samples_split are fractions of the training set.
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTC = DecisionTreeClassifier(random_state=10)
clf2 = GridSearchCV(DTC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf2.fit(X_train, y_train)
y_pred = clf2.predict(X_test)
print("DTC best parameters:", clf2.best_params_)
# .score reports the grid's scorer (recall of class 0), not accuracy.
print("DTC Train score:", clf2.score(X_train, y_train))
print("DTC Test score:", clf2.score(X_test, y_test))
print("DTC Test Accuracy score:", accuracy_score(y_pred, y_test))
# Confusion matrix (predictions as rows, true labels as columns).
ax = plt.subplot()
Model_Predictions = clf2.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
# matplotlib 3.1.x heatmap clipping workaround.
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Decision Tree Classifier')
# Random forest on the binary task, tuned for recall of the "Loss" class.
RFEstimatorList = [25,50,100,200]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFC = RandomForestClassifier(random_state=10)
clf3 = GridSearchCV(RFC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("RFC best parameters:", clf3.best_params_)
print("RFC Train score:", clf3.score(X_train, y_train))
print("RFC Test score:", clf3.score(X_test, y_test))
print("RFC Test Accuracy score:", accuracy_score(y_pred, y_test))
# Refit a plain RandomForestClassifier with the (hard-coded) best parameters
# so estimators_ / feature_importances_ are directly accessible.
# NOTE(review): these values must be kept in sync with the grid-search result
# printed above — they are copied by hand, not read from clf3.best_params_.
clf3=RandomForestClassifier(criterion= 'entropy', max_depth= 10, n_estimators= 200,random_state=10)
clf3.fit(X_train, y_train)
features = X.columns
importances = clf3.feature_importances_
# Per-feature std of importances across the individual trees (error bars).
# NOTE(review): `std` is also reused by the later XGBoost importance plot.
std = np.std([tree.feature_importances_ for tree in clf3.estimators_],
axis=0)
indices = np.argsort(importances)[::-1]
# Print the feature ranking
print("Feature ranking:")
for f in range(X_train.shape[1]):
print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X_train.shape[1]), importances[indices],
color="b", yerr=std[indices], align="center")
plt.xticks(range(len(indices)), [features[i] for i in indices])
plt.xticks(rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.show()
# Confusion matrix (predictions as rows, true labels as columns).
ax = plt.subplot()
Model_Predictions = clf3.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
# matplotlib 3.1.x heatmap clipping workaround.
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Random Forest Classifier')
# ROC curve for the refit Random Forest on the binary task.
ax = plt.subplot()
y_score = clf3.predict_proba(X_test)
Model_Predictions = clf3.predict(X_test)
# BUG FIX: the original computed roc_auc_score(Model_Predictions, y_test) —
# hard 0/1 predictions instead of probability scores, and with the arguments
# swapped (sklearn expects y_true first). Use the positive-class probability,
# which is also what the ROC curve below is drawn from, so the legend's area
# matches the plotted curve.
auc_score = roc_auc_score(y_test, y_score[:, 1])
fpr, tpr, _ = roc_curve(y_test, y_score[:, 1])
plt.plot(fpr, tpr, color='darkorange', label='ROC curve (area = %0.2f)' % (auc_score))
# Chance diagonal for reference.
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
plt.legend(loc="lower right")
ax.set_title('RFC ROC Curve')
# XGBoost on the binary task, tuned for recall of the "Loss" class.
params = {
'n_estimators': [100, 200, 300, 400],
'max_depth': [2, 3, 6],
'learning_rate': [0.005, 0.01, 0.02],
'subsample': [0.4, 0.6, 0.8]
}
# NOTE(review): `silent` is deprecated (removed in xgboost >= 1.0, replaced by
# `verbosity`) — confirm against the pinned xgboost version. No cv is passed,
# so GridSearchCV uses its default cross-validation.
clf = GridSearchCV(XGBClassifier(silent=False,random_state=10), params, scoring =make_scorer(recall_score,pos_label=0))
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("XGBC best parameters:", clf.best_params_)
# .score reports the grid's scorer (recall of class 0), not accuracy.
print("XGBC Train score:", clf.score(X_train, y_train))
print("XGBC Test score:", clf.score(X_test, y_test))
print("XGBC Test Accuracy score:", accuracy_score(y_pred, y_test))
# Refit XGBoost with the hand-copied best parameters so feature_importances_
# is directly accessible. NOTE(review): keep these values in sync with the
# grid-search result printed above.
clf=XGBClassifier(learning_rate= 0.005, max_depth= 2, n_estimators= 100, subsample= 0.6,random_state=10)
clf.fit(X_train, y_train)
features = X.columns
importances = clf.feature_importances_
indices = np.argsort(importances)[::-1]
# Print the feature ranking
print("Feature ranking:")
for f in range(X_train.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
# Plot the feature importances
plt.figure()
plt.title("Feature importances")
# BUG FIX: the original passed yerr=std[indices], but `std` was the leftover
# per-tree spread from the earlier Random Forest cell — a different model,
# indexed here with XGBoost's importance ordering, so the error bars were
# meaningless. XGBoost exposes no per-tree importance spread; plot without yerr.
plt.bar(range(X_train.shape[1]), importances[indices],
        color="b", align="center")
plt.xticks(range(len(indices)), [features[i] for i in indices])
plt.xticks(rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.show()
# Confusion matrix for the refit XGBoost model (predictions as rows, true
# labels as columns, matching the axis labels below).
ax = plt.subplot()
Model_Predictions = clf.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
# matplotlib 3.1.x heatmap clipping workaround.
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('XGBoost Classifier')
# ROC curve for the refit XGBoost model on the binary task.
ax = plt.subplot()
y_score = clf.predict_proba(X_test)
Model_Predictions = clf.predict(X_test)
# BUG FIX: as in the RFC ROC cell, the original called
# roc_auc_score(Model_Predictions, y_test) — hard predictions, swapped args.
# Compute AUC from the positive-class probabilities with y_true first, so the
# legend's area agrees with the plotted curve.
auc_score = roc_auc_score(y_test, y_score[:, 1])
fpr, tpr, _ = roc_curve(y_test, y_score[:, 1])
plt.plot(fpr, tpr, color='darkorange', label='ROC curve (area = %0.2f)' % (auc_score))
# Chance diagonal for reference.
plt.plot([0, 1], [0, 1], color='navy', linestyle='--')
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
plt.legend(loc="lower right")
ax.set_title('XGBC ROC Curve')
# KNN on the binary task; features min-max scaled (fit on train only), grid
# search over k, selected by recall of the "Loss" class.
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
Neighbor_List=[3,5,10,20,50]
parameters = {'n_neighbors':Neighbor_List}
KNNC = KNeighborsClassifier()
clf4 = GridSearchCV(KNNC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf4.fit(X_train_scaled, y_train)
y_pred = clf4.predict(X_test_scaled)
print("KNN best parameters:", clf4.best_params_)
# .score reports the grid's scorer (recall of class 0), not accuracy.
print("KNN Train score:", clf4.score(X_train_scaled, y_train))
print("KNN Test score:", clf4.score(X_test_scaled, y_test))
print("KNN Test Accuracy score:", accuracy_score(y_pred, y_test))
# Confusion matrix (predictions as rows, true labels as columns).
ax = plt.subplot()
Model_Predictions = clf4.predict(X_test_scaled)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
# matplotlib 3.1.x heatmap clipping workaround.
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('KNN Classifier')
# US-only dataset: load, drop identifier/leaky columns (gross and review counts
# would leak the target), one-hot encode the content rating, and remove
# gross/budget-ratio outliers with the 1.5*IQR rule.
movie_df = pd.read_csv(r'data/data_regression_onlyUS.csv')
classification_df = movie_df.drop(columns=['imdb_id', 'director_name', 'num_critic_for_reviews',
'actor_2_name',
'gross', 'genres', 'actor_1_name',
'movie_title', 'num_voted_users',
'actor_3_name', 'plot_keywords',
'num_user_for_reviews',
'title_year', 'imdb_score',
'movie_facebook_likes'])
classification_df
classification_df.columns
classification1_df=classification_df.drop(columns=['language'])
df1 = pd.get_dummies(classification1_df, columns=['content_rating'])
# Drop rare rating dummies to keep only the common categories.
df1 = df1.drop(columns =['content_rating_Not Rated','content_rating_G','content_rating_X','content_rating_NC-17'])
Q1 = df1['gross_budget_ratio'].quantile(0.25)
Q3 = df1['gross_budget_ratio'].quantile(0.75)
IQR = Q3 - Q1 #IQR is interquartile range.
filtering = (df1['gross_budget_ratio'] >= Q1 - 1.5 * IQR) & (df1['gross_budget_ratio'] <= Q3 + 1.5 *IQR)
df1=df1.loc[filtering]
df1
# Three-class target: 0 = loss (ratio <= 1), 1 = low profit (<= 2),
# 2 = high profit.
RatioClass = df1['gross_budget_ratio'].transform(lambda x: 0 if x <= 1 else (1 if x <= 2 else 2))
RatioClass = RatioClass.to_frame()
Counts=RatioClass['gross_budget_ratio'].value_counts()
Counts
sns.barplot(['Loss','Low Profit','High Profit'],Counts)
plt.xticks(rotation=0)
plt.ylabel("Number of movies")
plt.title(f"Number of movies")
plt.show()
df11 = df1.copy()
df11['gross_budget_ratio'] = RatioClass['gross_budget_ratio']
df11
X = df11.drop(columns=['gross_budget_ratio'])
y = df11['gross_budget_ratio']
# Stratified 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
# SMOTE applied twice with 'minority' so that each of the two smaller classes
# is oversampled in turn, then shuffle the balanced training set.
# NOTE(review): SMOTE('minority') positional arg and fit_sample are old-API
# (fit_resample / sampling_strategy= in imblearn >= 0.4) — confirm the pinned
# imblearn version.
smote=SMOTE('minority')
X_s, y_s = smote.fit_sample(X_train,y_train)
X_sm, y_sm = smote.fit_sample(X_s,y_s)
X_sm_train, y_sm_train = shuffle(X_sm, y_sm, random_state=10)
# Multinomial logistic regression on the SMOTE-balanced 3-class training set;
# evaluated on the untouched (imbalanced) test split.
C_List = [0.001,0.01,0.1,1,10,100,1000]
penalty = ['l2','none']
parameters = {'C': C_List,'penalty':penalty}
MLR = LogisticRegression(multi_class='multinomial', solver='newton-cg')
clf1 = GridSearchCV(MLR, parameters, cv=5, verbose=0,scoring ='accuracy')
clf1.fit(X_sm_train, y_sm_train)
y_pred = clf1.predict(X_test)
print("MLR best parameters:", clf1.best_params_)
print("MLR Train score:", clf1.score(X_sm_train, y_sm_train))
print("MLR Test score:", clf1.score(X_test, y_test))
# Confusion matrix (predictions as rows, true labels as columns).
ax = plt.subplot()
Model_Predictions = clf1.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
# matplotlib 3.1.x heatmap clipping workaround (also used below).
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Multiclass Logistic Regression')
# Decision tree on the ORIGINAL (non-SMOTE) training split, tuned by accuracy.
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTC = DecisionTreeClassifier(random_state=10)
clf2 = GridSearchCV(DTC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf2.fit(X_train, y_train)
y_pred = clf2.predict(X_test)
print("DTC best parameters:", clf2.best_params_)
print("DTC Train score:", clf2.score(X_train, y_train))
print("DTC Test score:", clf2.score(X_test, y_test))
ax = plt.subplot()
Model_Predictions = clf2.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Decision Tree Classifier')
# Random forest, tuned by accuracy.
RFEstimatorList = [25,50,100,200]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFC = RandomForestClassifier(random_state=10)
clf3 = GridSearchCV(RFC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("RFC best parameters:", clf3.best_params_)
print("RFC Train score:", clf3.score(X_train, y_train))
print("RFC Test score:", clf3.score(X_test, y_test))
ax = plt.subplot()
Model_Predictions = clf3.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Random Forest Classifier')
# KNN on min-max-scaled features (scaler fit on train only), tuned by accuracy.
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
Neighbor_List=[3,5,10,20,50]
parameters = {'n_neighbors':Neighbor_List}
KNNC = KNeighborsClassifier()
clf4 = GridSearchCV(KNNC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf4.fit(X_train_scaled, y_train)
y_pred = clf4.predict(X_test_scaled)
print("KNN best parameters:", clf4.best_params_)
print("KNN Train score:", clf4.score(X_train_scaled, y_train))
print("KNN Test score:", clf4.score(X_test_scaled, y_test))
ax = plt.subplot()
Model_Predictions = clf4.predict(X_test_scaled)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('KNN Classifier')
# Binary re-labelling of df1: 0 = loss (ratio <= 1), 1 = profitable.
RatioClass = df1['gross_budget_ratio'].transform(lambda x: 0 if x <= 1 else 1)
RatioClass = RatioClass.to_frame()
Counts = RatioClass['gross_budget_ratio'].value_counts()
Counts
# FIX: the original built Counts2 by appending Counts[1] then Counts[0] and
# reversing the list — a convoluted way of producing [Counts[0], Counts[1]].
# Index directly so the bar order matches the ['Loss','Profitable'] labels.
Counts2 = [Counts[0], Counts[1]]
sns.barplot(['Loss','Profitable'],Counts2)
plt.xticks(rotation=0)
plt.ylabel("Number of movies")
plt.title("Number of movies")
plt.show()
# Replace the ratio with the binary label and make a stratified 70/30 split.
df12 = df1.copy()
df12['gross_budget_ratio'] = RatioClass['gross_budget_ratio']
X = df12.drop(columns=['gross_budget_ratio'])
y = df12['gross_budget_ratio']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
# Logistic regression tuned for recall of the "Loss" class (pos_label=0).
C_List = [0.001,0.01,0.1,1,10,100,1000]
penalty = ['l1','l2']
parameters = {'C': C_List,'penalty':penalty}
LR = LogisticRegression( solver='liblinear')
clf1 = GridSearchCV(LR, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
print("LR best parameters:", clf1.best_params_)
# .score reports the grid's scorer (recall of class 0), not accuracy.
print("LR Train score:", clf1.score(X_train, y_train))
print("LR Test score:", clf1.score(X_test, y_test))
print("LR Test Accuracy score:", accuracy_score(y_pred, y_test))
# Confusion matrix (predictions as rows, true labels as columns).
ax = plt.subplot()
Model_Predictions = clf1.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
# matplotlib 3.1.x heatmap clipping workaround (also used below).
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Logistic Regression')
# Decision tree tuned for recall of the "Loss" class.
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTC = DecisionTreeClassifier(random_state=10)
clf2 = GridSearchCV(DTC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf2.fit(X_train, y_train)
y_pred = clf2.predict(X_test)
print("DTC best parameters:", clf2.best_params_)
print("DTC Train score:", clf2.score(X_train, y_train))
print("DTC Test score:", clf2.score(X_test, y_test))
print("DTC Test Accuracy score:", accuracy_score(y_pred, y_test))
ax = plt.subplot()
Model_Predictions = clf2.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Decision Tree Classifier')
# Random forest on the binary df1 task, tuned for recall of the "Loss" class.
RFEstimatorList = [25,50,100,200]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFC = RandomForestClassifier(random_state=10)
clf3 = GridSearchCV(RFC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("RFC best parameters:", clf3.best_params_)
# .score reports the grid's scorer (recall of class 0), not accuracy.
print("RFC Train score:", clf3.score(X_train, y_train))
print("RFC Test score:", clf3.score(X_test, y_test))
print("RFC Test Accuracy score:", accuracy_score(y_pred, y_test))
# Confusion matrix (predictions as rows, true labels as columns).
ax = plt.subplot()
Model_Predictions = clf3.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
# matplotlib 3.1.x heatmap clipping workaround (also used below).
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Random Forest Classifier')
# KNN on min-max-scaled features, tuned for recall of the "Loss" class.
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
Neighbor_List=[3,5,10,20,50]
parameters = {'n_neighbors':Neighbor_List}
KNNC = KNeighborsClassifier()
clf4 = GridSearchCV(KNNC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf4.fit(X_train_scaled, y_train)
y_pred = clf4.predict(X_test_scaled)
print("KNN best parameters:", clf4.best_params_)
print("KNN Train score:", clf4.score(X_train_scaled, y_train))
print("KNN Test score:", clf4.score(X_test_scaled, y_test))
print("KNN Test Accuracy score:", accuracy_score(y_pred, y_test))
ax = plt.subplot()
Model_Predictions = clf4.predict(X_test_scaled)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('KNN Classifier')
# Feature-engineered copy of df1: log-compress the heavily skewed budget and
# Facebook-like counts. Likes <= 3 are mapped to 0 (also avoids log(0)/negative
# logs); budget is assumed strictly positive here.
df2 = df1.copy()
df2['budget'] = np.log(df2['budget'])
df2['director_facebook_likes'] = df2['director_facebook_likes'].transform(lambda x: 0 if x <= 3 else np.log(x))
df2['actor_1_facebook_likes'] = df2['actor_1_facebook_likes'].transform(lambda x: 0 if x <= 3 else np.log(x))
df2['actor_2_facebook_likes'] = df2['actor_2_facebook_likes'].transform(lambda x: 0 if x <= 3 else np.log(x))
df2['actor_3_facebook_likes'] = df2['actor_3_facebook_likes'].transform(lambda x: 0 if x <= 3 else np.log(x))
df2['cast_total_facebook_likes'] = df2['cast_total_facebook_likes'].transform(lambda x: 0 if x <= 3 else np.log(x))
df2.columns
# Drop low-signal columns for this experiment.
df2 = df2.drop(columns = ['facenumber_in_poster', 'Biography', 'Crime',
'Mystery_Thriller_Horror', 'Sci-Fi_Fantasy',
'History_War'])
# Three-class target: 0 = loss, 1 = low profit (<= 2x), 2 = high profit.
RatioClass = df2['gross_budget_ratio'].transform(lambda x: 0 if x <= 1 else (1 if x <= 2 else 2))
RatioClass = RatioClass.to_frame()
Counts=RatioClass['gross_budget_ratio'].value_counts()
Counts
df21 = df2.copy()
df21['gross_budget_ratio'] = RatioClass['gross_budget_ratio']
df21
X = df21.drop(columns=['gross_budget_ratio'])
y = df21['gross_budget_ratio']
# Stratified 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
# SMOTE twice with 'minority' to oversample both smaller classes; see the
# earlier SMOTE cell for the old-API (fit_sample) caveat.
smote=SMOTE('minority')
X_s, y_s = smote.fit_sample(X_train,y_train)
X_sm, y_sm = smote.fit_sample(X_s,y_s)
X_sm_train, y_sm_train = shuffle(X_sm, y_sm, random_state=10)
# Multinomial logistic regression on the SMOTE-balanced, log-transformed
# 3-class training set; evaluated on the untouched test split.
C_List = [0.001,0.01,0.1,1,10,100,1000]
penalty = ['l2','none']
parameters = {'C': C_List,'penalty':penalty}
MLR = LogisticRegression(multi_class='multinomial', solver='newton-cg')
clf1 = GridSearchCV(MLR, parameters, cv=5, verbose=0,scoring ='accuracy')
clf1.fit(X_sm_train, y_sm_train)
y_pred = clf1.predict(X_test)
print("MLR best parameters:", clf1.best_params_)
print("MLR Train score:", clf1.score(X_sm_train, y_sm_train))
print("MLR Test score:", clf1.score(X_test, y_test))
# Confusion matrix (predictions as rows, true labels as columns).
ax = plt.subplot()
Model_Predictions = clf1.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
# matplotlib 3.1.x heatmap clipping workaround (also used below).
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Multiclass Logistic Regression')
# Decision tree on the original (non-SMOTE) training split, tuned by accuracy.
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTC = DecisionTreeClassifier(random_state=10)
clf2 = GridSearchCV(DTC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf2.fit(X_train, y_train)
y_pred = clf2.predict(X_test)
print("DTC best parameters:", clf2.best_params_)
print("DTC Train score:", clf2.score(X_train, y_train))
print("DTC Test score:", clf2.score(X_test, y_test))
ax = plt.subplot()
Model_Predictions = clf2.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Decision Tree Classifier')
# Random forest, tuned by accuracy.
RFEstimatorList = [25,50,100,200]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFC = RandomForestClassifier(random_state=10)
clf3 = GridSearchCV(RFC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("RFC best parameters:", clf3.best_params_)
print("RFC Train score:", clf3.score(X_train, y_train))
print("RFC Test score:", clf3.score(X_test, y_test))
ax = plt.subplot()
Model_Predictions = clf3.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Random Forest Classifier')
# KNN on min-max-scaled features (scaler fit on train only), tuned by accuracy.
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
Neighbor_List=[3,5,10,20,50]
parameters = {'n_neighbors':Neighbor_List}
KNNC = KNeighborsClassifier()
clf4 = GridSearchCV(KNNC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf4.fit(X_train_scaled, y_train)
y_pred = clf4.predict(X_test_scaled)
print("KNN best parameters:", clf4.best_params_)
print("KNN Train score:", clf4.score(X_train_scaled, y_train))
print("KNN Test score:", clf4.score(X_test_scaled, y_test))
ax = plt.subplot()
Model_Predictions = clf4.predict(X_test_scaled)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('KNN Classifier')
# Binary target on the log-transformed features: 0 = loss, 1 = profitable.
RatioClass = df2['gross_budget_ratio'].transform(lambda x: 0 if x <= 1 else 1)
RatioClass = RatioClass.to_frame()
Counts=RatioClass['gross_budget_ratio'].value_counts()
Counts
df22 = df2.copy()
df22['gross_budget_ratio'] = RatioClass['gross_budget_ratio']
X = df22.drop(columns=['gross_budget_ratio'])
y = df22['gross_budget_ratio']
# Stratified 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
# Logistic regression tuned for recall of the "Loss" class (pos_label=0).
C_List = [0.001,0.01,0.1,1,10,100,1000]
penalty = ['l1','l2']
parameters = {'C': C_List,'penalty':penalty}
LR = LogisticRegression( solver='liblinear')
clf1 = GridSearchCV(LR, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
print("LR best parameters:", clf1.best_params_)
# .score reports the grid's scorer (recall of class 0), not accuracy.
print("LR Train score:", clf1.score(X_train, y_train))
print("LR Test score:", clf1.score(X_test, y_test))
print("LR Test Accuracy score:", accuracy_score(y_pred, y_test))
# Confusion matrix (predictions as rows, true labels as columns).
ax = plt.subplot()
Model_Predictions = clf1.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
# matplotlib 3.1.x heatmap clipping workaround (also used below).
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Logistic Regression')
# Decision tree tuned for recall of the "Loss" class.
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTC = DecisionTreeClassifier(random_state=10)
clf2 = GridSearchCV(DTC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf2.fit(X_train, y_train)
y_pred = clf2.predict(X_test)
print("DTC best parameters:", clf2.best_params_)
print("DTC Train score:", clf2.score(X_train, y_train))
print("DTC Test score:", clf2.score(X_test, y_test))
print("DTC Test Accuracy score:", accuracy_score(y_pred, y_test))
ax = plt.subplot()
Model_Predictions = clf2.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Decision Tree Classifier')
# Random forest on the binary log-features task, tuned for recall of "Loss".
RFEstimatorList = [25,50,100,200]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFC = RandomForestClassifier(random_state=10)
clf3 = GridSearchCV(RFC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("RFC best parameters:", clf3.best_params_)
# .score reports the grid's scorer (recall of class 0), not accuracy.
print("RFC Train score:", clf3.score(X_train, y_train))
print("RFC Test score:", clf3.score(X_test, y_test))
print("RFC Test Accuracy score:", accuracy_score(y_pred, y_test))
# Confusion matrix (predictions as rows, true labels as columns).
ax = plt.subplot()
Model_Predictions = clf3.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
# matplotlib 3.1.x heatmap clipping workaround (also used below).
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Random Forest Classifier')
# KNN on min-max-scaled features, tuned for recall of the "Loss" class.
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
Neighbor_List=[3,5,10,20,50]
parameters = {'n_neighbors':Neighbor_List}
KNNC = KNeighborsClassifier()
clf4 = GridSearchCV(KNNC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf4.fit(X_train_scaled, y_train)
y_pred = clf4.predict(X_test_scaled)
print("KNN best parameters:", clf4.best_params_)
print("KNN Train score:", clf4.score(X_train_scaled, y_train))
print("KNN Test score:", clf4.score(X_test_scaled, y_test))
print("KNN Test Accuracy score:", accuracy_score(y_pred, y_test))
ax = plt.subplot()
Model_Predictions = clf4.predict(X_test_scaled)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('KNN Classifier')
movie_df = pd.read_csv(r'data/data_regression.csv')
classification_df = movie_df.drop(columns=['imdb_id', 'director_name', 'num_critic_for_reviews',
'actor_2_name',
'gross', 'genres', 'actor_1_name',
'movie_title', 'num_voted_users',
'actor_3_name', 'plot_keywords',
'num_user_for_reviews',
'title_year', 'imdb_score',
'movie_facebook_likes','director_rank', 'actor1_rank', 'actor2_rank', 'actor3_rank'])
classification_df
classification_df.columns
classification1_df=classification_df.drop(columns=['country','language'])
df1 = pd.get_dummies(classification1_df, columns=['content_rating'])
df1 = df1.drop(columns =['content_rating_Not Rated','content_rating_G','content_rating_X','content_rating_NC-17'])
Q1 = df1['gross_budget_ratio'].quantile(0.25)
Q3 = df1['gross_budget_ratio'].quantile(0.75)
IQR = Q3 - Q1 #IQR is interquartile range.
filtering = (df1['gross_budget_ratio'] >= Q1 - 1.5 * IQR) & (df1['gross_budget_ratio'] <= Q3 + 1.5 *IQR)
df1=df1.loc[filtering]
df1
# Bucket the gross/budget ratio into three classes:
# 0 = loss (ratio <= 1), 1 = low profit (1 < ratio <= 2), 2 = high profit (> 2).
RatioClass = df1['gross_budget_ratio'].transform(lambda x: 0 if x <= 1 else (1 if x <= 2 else 2))
RatioClass = RatioClass.to_frame()
# Class frequencies (displayed in the notebook to inspect the imbalance).
Counts=RatioClass['gross_budget_ratio'].value_counts()
Counts
# df11: feature table with the continuous ratio replaced by its class label
# (assignment is index-aligned, so rows match up after the IQR filter).
df11 = df1.copy()
df11['gross_budget_ratio'] = RatioClass['gross_budget_ratio']
df11
X = df11.drop(columns=['gross_budget_ratio'])
y = df11['gross_budget_ratio']
# Stratified 70/30 split; fixed random_state for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
# Oversample the training set with SMOTE to balance the three ratio classes.
# FIX: the legacy positional SMOTE('minority') and fit_sample() were removed
# from imbalanced-learn (fit_sample dropped in 0.8); sampling_strategy= and
# fit_resample() are the supported spellings and exist since imblearn 0.4.
smote = SMOTE(sampling_strategy='minority')
# 'minority' only oversamples the single smallest class per call, so two
# passes are needed to bring both minority classes up to the majority size.
X_s, y_s = smote.fit_resample(X_train, y_train)
X_sm, y_sm = smote.fit_resample(X_s, y_s)
# Shuffle so the synthetic rows appended by SMOTE aren't grouped at the end.
X_sm_train, y_sm_train = shuffle(X_sm, y_sm, random_state=10)
# ---- Three-class classifiers on df11 (0=loss, 1=low profit, 2=high profit) ----
# Multinomial logistic regression, grid-searched on accuracy with 5-fold CV.
# Trained on the SMOTE-balanced set; evaluated on the untouched test split.
C_List = [0.001,0.01,0.1,1,10,100,1000]
# NOTE(review): 'none' is the legacy sklearn penalty spelling (removed in
# sklearn 1.4, which expects penalty=None) -- confirm the pinned version.
penalty = ['l2','none']
parameters = {'C': C_List,'penalty':penalty}
MLR = LogisticRegression(multi_class='multinomial', solver='newton-cg')
clf1 = GridSearchCV(MLR, parameters, cv=5, verbose=0,scoring ='accuracy')
clf1.fit(X_sm_train, y_sm_train)
y_pred = clf1.predict(X_test)
print("MLR best parameters:", clf1.best_params_)
print("MLR Train score:", clf1.score(X_sm_train, y_sm_train))
print("MLR Test score:", clf1.score(X_test, y_test))
# Confusion matrix built as confusion_matrix(predictions, truth) -- the
# transpose of sklearn's convention; the flipped axis labels compensate.
ax = plt.subplot()
Model_Predictions = clf1.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
# Half-cell y-limit nudge: matplotlib 3.1.x heatmap row-clipping workaround.
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Multiclass Logistic Regression')
# Decision tree, tuned over criterion, depth, and leaf/split fractions.
# Trained on the original (imbalanced) training set, unlike the MLR above.
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTC = DecisionTreeClassifier(random_state=10)
clf2 = GridSearchCV(DTC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf2.fit(X_train, y_train)
y_pred = clf2.predict(X_test)
print("DTC best parameters:", clf2.best_params_)
print("DTC Train score:", clf2.score(X_train, y_train))
print("DTC Test score:", clf2.score(X_test, y_test))
ax = plt.subplot()
Model_Predictions = clf2.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Decision Tree Classifier')
# Random forest, tuned over criterion, depth, and ensemble size.
RFEstimatorList = [25,50,100,200]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFC = RandomForestClassifier(random_state=10)
clf3 = GridSearchCV(RFC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("RFC best parameters:", clf3.best_params_)
print("RFC Train score:", clf3.score(X_train, y_train))
print("RFC Test score:", clf3.score(X_test, y_test))
ax = plt.subplot()
Model_Predictions = clf3.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Random Forest Classifier')
# KNN is distance-based, so features are min-max scaled; the scaler is fit
# on the training split only and then applied to the test split.
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
Neighbor_List=[3,5,10,20,50]
parameters = {'n_neighbors':Neighbor_List}
KNNC = KNeighborsClassifier()
clf4 = GridSearchCV(KNNC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf4.fit(X_train_scaled, y_train)
y_pred = clf4.predict(X_test_scaled)
print("KNN best parameters:", clf4.best_params_)
print("KNN Train score:", clf4.score(X_train_scaled, y_train))
print("KNN Test score:", clf4.score(X_test_scaled, y_test))
ax = plt.subplot()
Model_Predictions = clf4.predict(X_test_scaled)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('KNN Classifier')
# Collapse the ratio into a binary target: 0 = loss (ratio <= 1), 1 = profit.
RatioClass = df1['gross_budget_ratio'].transform(lambda x: 0 if x <= 1 else 1)
RatioClass = RatioClass.to_frame()
# Class frequencies (displayed to check the binary balance).
Counts=RatioClass['gross_budget_ratio'].value_counts()
Counts
# df12: same features as df1 but with the binary class label as target.
df12 = df1.copy()
df12['gross_budget_ratio'] = RatioClass['gross_budget_ratio']
X = df12.drop(columns=['gross_budget_ratio'])
y = df12['gross_budget_ratio']
# Stratified 70/30 split with the same seed as the multiclass experiment.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
# ---- Binary classifiers on df12 (0 = loss, 1 = profit) ----
# All grids are scored on recall of the "loss" class (pos_label=0): the goal
# is to catch money-losing movies, so false negatives on class 0 are costly.
C_List = [0.001,0.01,0.1,1,10,100,1000]
penalty = ['l1','l2']
parameters = {'C': C_List,'penalty':penalty}
# liblinear supports both l1 and l2 penalties for binary problems.
LR = LogisticRegression( solver='liblinear')
clf1 = GridSearchCV(LR, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
print("LR best parameters:", clf1.best_params_)
# clf1.score here reports the grid's scorer (recall of class 0), not accuracy.
print("LR Train score:", clf1.score(X_train, y_train))
print("LR Test score:", clf1.score(X_test, y_test))
# accuracy_score's signature is (y_true, y_pred); arguments are swapped but
# accuracy is symmetric, so the value is unaffected.
print("LR Test Accuracy score:", accuracy_score(y_pred, y_test))
# Confusion matrix built as confusion_matrix(predictions, truth) -- the
# transpose of sklearn's convention; the flipped axis labels compensate.
ax = plt.subplot()
Model_Predictions = clf1.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
# Half-cell y-limit nudge: matplotlib 3.1.x heatmap row-clipping workaround.
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Logistic Regression')
# Decision tree tuned for class-0 recall.
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTC = DecisionTreeClassifier(random_state=10)
clf2 = GridSearchCV(DTC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf2.fit(X_train, y_train)
y_pred = clf2.predict(X_test)
print("DTC best parameters:", clf2.best_params_)
print("DTC Train score:", clf2.score(X_train, y_train))
print("DTC Test score:", clf2.score(X_test, y_test))
print("DTC Test Accuracy score:", accuracy_score(y_pred, y_test))
ax = plt.subplot()
Model_Predictions = clf2.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Decision Tree Classifier')
# Random forest tuned for class-0 recall.
RFEstimatorList = [25,50,100,200]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFC = RandomForestClassifier(random_state=10)
clf3 = GridSearchCV(RFC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("RFC best parameters:", clf3.best_params_)
print("RFC Train score:", clf3.score(X_train, y_train))
print("RFC Test score:", clf3.score(X_test, y_test))
print("RFC Test Accuracy score:", accuracy_score(y_pred, y_test))
ax = plt.subplot()
Model_Predictions = clf3.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Random Forest Classifier')
# KNN on min-max-scaled features (scaler fit on train only), class-0 recall.
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
Neighbor_List=[3,5,10,20,50]
parameters = {'n_neighbors':Neighbor_List}
KNNC = KNeighborsClassifier()
clf4 = GridSearchCV(KNNC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf4.fit(X_train_scaled, y_train)
y_pred = clf4.predict(X_test_scaled)
print("KNN best parameters:", clf4.best_params_)
print("KNN Train score:", clf4.score(X_train_scaled, y_train))
print("KNN Test score:", clf4.score(X_test_scaled, y_test))
print("KNN Test Accuracy score:", accuracy_score(y_pred, y_test))
ax = plt.subplot()
Model_Predictions = clf4.predict(X_test_scaled)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('KNN Classifier')
# df2: log-transformed variant of df1 to compress the heavy-tailed
# budget/popularity features.
df2 = df1.copy()
df2['budget'] = np.log(df2['budget'])
# Facebook-like counts <= 3 are mapped to 0 instead of log(x); this avoids
# log(0) = -inf for zero-like counts (values in (0, 3] would give small logs
# anyway, so they are flattened to 0 as well).
df2['director_facebook_likes'] = df2['director_facebook_likes'].transform(lambda x: 0 if x <= 3 else np.log(x))
df2['actor_1_facebook_likes'] = df2['actor_1_facebook_likes'].transform(lambda x: 0 if x <= 3 else np.log(x))
df2['actor_2_facebook_likes'] = df2['actor_2_facebook_likes'].transform(lambda x: 0 if x <= 3 else np.log(x))
df2['actor_3_facebook_likes'] = df2['actor_3_facebook_likes'].transform(lambda x: 0 if x <= 3 else np.log(x))
df2['cast_total_facebook_likes'] = df2['cast_total_facebook_likes'].transform(lambda x: 0 if x <= 3 else np.log(x))
df2.columns
# Drop features found weak in the earlier experiments (genre flags etc.).
df2 = df2.drop(columns = ['facenumber_in_poster', 'Biography', 'Crime',
                          'Mystery_Thriller_Horror', 'Sci-Fi_Fantasy',
                          'History_War'])
# Three-class target on the log-transformed features:
# 0 = loss (ratio <= 1), 1 = low profit (<= 2), 2 = high profit.
RatioClass = df2['gross_budget_ratio'].transform(lambda x: 0 if x <= 1 else (1 if x <= 2 else 2))
RatioClass = RatioClass.to_frame()
Counts=RatioClass['gross_budget_ratio'].value_counts()
Counts
# df21: log-transformed features with the 3-class label as target.
df21 = df2.copy()
df21['gross_budget_ratio'] = RatioClass['gross_budget_ratio']
df21
X = df21.drop(columns=['gross_budget_ratio'])
y = df21['gross_budget_ratio']
# Stratified 70/30 split, same seed as the other experiments.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
# Oversample the training set with SMOTE to balance the three ratio classes.
# FIX: the legacy positional SMOTE('minority') and fit_sample() were removed
# from imbalanced-learn (fit_sample dropped in 0.8); sampling_strategy= and
# fit_resample() are the supported spellings and exist since imblearn 0.4.
smote = SMOTE(sampling_strategy='minority')
# 'minority' only oversamples the single smallest class per call, so two
# passes are needed to bring both minority classes up to the majority size.
X_s, y_s = smote.fit_resample(X_train, y_train)
X_sm, y_sm = smote.fit_resample(X_s, y_s)
# Shuffle so the synthetic rows appended by SMOTE aren't grouped at the end.
X_sm_train, y_sm_train = shuffle(X_sm, y_sm, random_state=10)
# ---- Three-class classifiers on df21 (log-transformed features) ----
# Multinomial logistic regression, grid-searched on accuracy with 5-fold CV;
# trained on the SMOTE-balanced set, evaluated on the untouched test split.
C_List = [0.001,0.01,0.1,1,10,100,1000]
# NOTE(review): 'none' is the legacy sklearn penalty spelling (removed in
# sklearn 1.4, which expects penalty=None) -- confirm the pinned version.
penalty = ['l2','none']
parameters = {'C': C_List,'penalty':penalty}
MLR = LogisticRegression(multi_class='multinomial', solver='newton-cg')
clf1 = GridSearchCV(MLR, parameters, cv=5, verbose=0,scoring ='accuracy')
clf1.fit(X_sm_train, y_sm_train)
y_pred = clf1.predict(X_test)
print("MLR best parameters:", clf1.best_params_)
print("MLR Train score:", clf1.score(X_sm_train, y_sm_train))
print("MLR Test score:", clf1.score(X_test, y_test))
# Confusion matrix is (predictions, truth) -- transposed vs sklearn's
# convention; the flipped axis labels compensate.
ax = plt.subplot()
Model_Predictions = clf1.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
# Half-cell y-limit nudge: matplotlib 3.1.x heatmap row-clipping workaround.
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Multiclass Logistic Regression')
# Decision tree on the original (imbalanced) training set.
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTC = DecisionTreeClassifier(random_state=10)
clf2 = GridSearchCV(DTC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf2.fit(X_train, y_train)
y_pred = clf2.predict(X_test)
print("DTC best parameters:", clf2.best_params_)
print("DTC Train score:", clf2.score(X_train, y_train))
print("DTC Test score:", clf2.score(X_test, y_test))
ax = plt.subplot()
Model_Predictions = clf2.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Decision Tree Classifier')
# Random forest, tuned over criterion, depth, and ensemble size.
RFEstimatorList = [25,50,100,200]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFC = RandomForestClassifier(random_state=10)
clf3 = GridSearchCV(RFC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("RFC best parameters:", clf3.best_params_)
print("RFC Train score:", clf3.score(X_train, y_train))
print("RFC Test score:", clf3.score(X_test, y_test))
ax = plt.subplot()
Model_Predictions = clf3.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Random Forest Classifier')
# KNN on min-max-scaled features (scaler fit on train only).
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
Neighbor_List=[3,5,10,20,50]
parameters = {'n_neighbors':Neighbor_List}
KNNC = KNeighborsClassifier()
clf4 = GridSearchCV(KNNC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf4.fit(X_train_scaled, y_train)
y_pred = clf4.predict(X_test_scaled)
print("KNN best parameters:", clf4.best_params_)
print("KNN Train score:", clf4.score(X_train_scaled, y_train))
print("KNN Test score:", clf4.score(X_test_scaled, y_test))
ax = plt.subplot()
Model_Predictions = clf4.predict(X_test_scaled)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('KNN Classifier')
# Binary target on the log-transformed features: 0 = loss, 1 = profit.
RatioClass = df2['gross_budget_ratio'].transform(lambda x: 0 if x <= 1 else 1)
RatioClass = RatioClass.to_frame()
Counts=RatioClass['gross_budget_ratio'].value_counts()
Counts
# df22: log-transformed features with the binary label as target.
df22 = df2.copy()
df22['gross_budget_ratio'] = RatioClass['gross_budget_ratio']
X = df22.drop(columns=['gross_budget_ratio'])
y = df22['gross_budget_ratio']
# Stratified 70/30 split, same seed as the other experiments.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
# ---- Binary classifiers on df22 (log-transformed features) ----
# Grids are scored on recall of the "loss" class (pos_label=0).
C_List = [0.001,0.01,0.1,1,10,100,1000]
penalty = ['l1','l2']
parameters = {'C': C_List,'penalty':penalty}
# liblinear supports both l1 and l2 penalties for binary problems.
LR = LogisticRegression( solver='liblinear')
clf1 = GridSearchCV(LR, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
print("LR best parameters:", clf1.best_params_)
# clf1.score reports the grid's scorer (class-0 recall), not accuracy.
print("LR Train score:", clf1.score(X_train, y_train))
print("LR Test score:", clf1.score(X_test, y_test))
# accuracy_score args are swapped vs (y_true, y_pred) but accuracy is
# symmetric, so the value is unaffected.
print("LR Test Accuracy score:", accuracy_score(y_pred, y_test))
# Confusion matrix is (predictions, truth) -- transposed vs sklearn's
# convention; the flipped axis labels compensate.
ax = plt.subplot()
Model_Predictions = clf1.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
# Half-cell y-limit nudge: matplotlib 3.1.x heatmap row-clipping workaround.
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Logistic Regression')
# Decision tree tuned for class-0 recall.
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTC = DecisionTreeClassifier(random_state=10)
clf2 = GridSearchCV(DTC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf2.fit(X_train, y_train)
y_pred = clf2.predict(X_test)
print("DTC best parameters:", clf2.best_params_)
print("DTC Train score:", clf2.score(X_train, y_train))
print("DTC Test score:", clf2.score(X_test, y_test))
print("DTC Test Accuracy score:", accuracy_score(y_pred, y_test))
ax = plt.subplot()
Model_Predictions = clf2.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Decision Tree Classifier')
# Random forest tuned for class-0 recall.
RFEstimatorList = [25,50,100,200]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFC = RandomForestClassifier(random_state=10)
clf3 = GridSearchCV(RFC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("RFC best parameters:", clf3.best_params_)
print("RFC Train score:", clf3.score(X_train, y_train))
print("RFC Test score:", clf3.score(X_test, y_test))
print("RFC Test Accuracy score:", accuracy_score(y_pred, y_test))
ax = plt.subplot()
Model_Predictions = clf3.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Random Forest Classifier')
# KNN on min-max-scaled features (scaler fit on train only), class-0 recall.
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
Neighbor_List=[3,5,10,20,50]
parameters = {'n_neighbors':Neighbor_List}
KNNC = KNeighborsClassifier()
clf4 = GridSearchCV(KNNC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf4.fit(X_train_scaled, y_train)
y_pred = clf4.predict(X_test_scaled)
print("KNN best parameters:", clf4.best_params_)
print("KNN Train score:", clf4.score(X_train_scaled, y_train))
print("KNN Test score:", clf4.score(X_test_scaled, y_test))
print("KNN Test Accuracy score:", accuracy_score(y_pred, y_test))
ax = plt.subplot()
Model_Predictions = clf4.predict(X_test_scaled)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('KNN Classifier')
# Load the US-only movie dataset and assemble the classification features.
movie_df = pd.read_csv(r'data/data_regression_onlyUS.csv')

# Identifier, target-leaking, and rank columns that must not be used as
# classification features.
_us_dropped_columns = [
    'imdb_id', 'director_name', 'num_critic_for_reviews', 'actor_2_name',
    'gross', 'genres', 'actor_1_name', 'movie_title', 'num_voted_users',
    'actor_3_name', 'plot_keywords', 'num_user_for_reviews', 'title_year',
    'imdb_score', 'movie_facebook_likes', 'director_rank', 'actor1_rank',
    'actor2_rank', 'actor3_rank',
]
classification_df = movie_df.drop(columns=_us_dropped_columns)
classification_df
classification_df.columns

# Only 'language' is dropped here: the dataset is already US-only, so there
# is no 'country' column to remove. Then one-hot the MPAA rating and keep
# only the common rating levels.
classification1_df = classification_df.drop(columns=['language'])
df1 = pd.get_dummies(classification1_df, columns=['content_rating'])
df1 = df1.drop(columns=['content_rating_Not Rated', 'content_rating_G',
                        'content_rating_X', 'content_rating_NC-17'])

# Remove gross/budget-ratio outliers using Tukey's 1.5*IQR fences.
Q1, Q3 = df1['gross_budget_ratio'].quantile([0.25, 0.75])
IQR = Q3 - Q1  # IQR is interquartile range.
filtering = df1['gross_budget_ratio'].between(Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)
df1 = df1[filtering]
df1
# Three-class target on the US-only data:
# 0 = loss (ratio <= 1), 1 = low profit (<= 2), 2 = high profit.
RatioClass = df1['gross_budget_ratio'].transform(lambda x: 0 if x <= 1 else (1 if x <= 2 else 2))
RatioClass = RatioClass.to_frame()
Counts=RatioClass['gross_budget_ratio'].value_counts()
Counts
# df11: US-only features with the 3-class label as target.
df11 = df1.copy()
df11['gross_budget_ratio'] = RatioClass['gross_budget_ratio']
df11
X = df11.drop(columns=['gross_budget_ratio'])
y = df11['gross_budget_ratio']
# Stratified 70/30 split, same seed as the other experiments.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
# Oversample the training set with SMOTE to balance the three ratio classes.
# FIX: the legacy positional SMOTE('minority') and fit_sample() were removed
# from imbalanced-learn (fit_sample dropped in 0.8); sampling_strategy= and
# fit_resample() are the supported spellings and exist since imblearn 0.4.
smote = SMOTE(sampling_strategy='minority')
# 'minority' only oversamples the single smallest class per call, so two
# passes are needed to bring both minority classes up to the majority size.
X_s, y_s = smote.fit_resample(X_train, y_train)
X_sm, y_sm = smote.fit_resample(X_s, y_s)
# Shuffle so the synthetic rows appended by SMOTE aren't grouped at the end.
X_sm_train, y_sm_train = shuffle(X_sm, y_sm, random_state=10)
# ---- Three-class classifiers on the US-only df11 ----
# Multinomial logistic regression, grid-searched on accuracy with 5-fold CV;
# trained on the SMOTE-balanced set, evaluated on the untouched test split.
C_List = [0.001,0.01,0.1,1,10,100,1000]
# NOTE(review): 'none' is the legacy sklearn penalty spelling (removed in
# sklearn 1.4, which expects penalty=None) -- confirm the pinned version.
penalty = ['l2','none']
parameters = {'C': C_List,'penalty':penalty}
MLR = LogisticRegression(multi_class='multinomial', solver='newton-cg')
clf1 = GridSearchCV(MLR, parameters, cv=5, verbose=0,scoring ='accuracy')
clf1.fit(X_sm_train, y_sm_train)
y_pred = clf1.predict(X_test)
print("MLR best parameters:", clf1.best_params_)
print("MLR Train score:", clf1.score(X_sm_train, y_sm_train))
print("MLR Test score:", clf1.score(X_test, y_test))
# Confusion matrix is (predictions, truth) -- transposed vs sklearn's
# convention; the flipped axis labels compensate.
ax = plt.subplot()
Model_Predictions = clf1.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
# Half-cell y-limit nudge: matplotlib 3.1.x heatmap row-clipping workaround.
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Multiclass Logistic Regression')
# Decision tree on the original (imbalanced) training set.
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTC = DecisionTreeClassifier(random_state=10)
clf2 = GridSearchCV(DTC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf2.fit(X_train, y_train)
y_pred = clf2.predict(X_test)
print("DTC best parameters:", clf2.best_params_)
print("DTC Train score:", clf2.score(X_train, y_train))
print("DTC Test score:", clf2.score(X_test, y_test))
ax = plt.subplot()
Model_Predictions = clf2.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Decision Tree Classifier')
# Random forest, tuned over criterion, depth, and ensemble size.
RFEstimatorList = [25,50,100,200]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFC = RandomForestClassifier(random_state=10)
clf3 = GridSearchCV(RFC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("RFC best parameters:", clf3.best_params_)
print("RFC Train score:", clf3.score(X_train, y_train))
print("RFC Test score:", clf3.score(X_test, y_test))
ax = plt.subplot()
Model_Predictions = clf3.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Random Forest Classifier')
# KNN on min-max-scaled features (scaler fit on train only).
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
Neighbor_List=[3,5,10,20,50]
parameters = {'n_neighbors':Neighbor_List}
KNNC = KNeighborsClassifier()
clf4 = GridSearchCV(KNNC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf4.fit(X_train_scaled, y_train)
y_pred = clf4.predict(X_test_scaled)
print("KNN best parameters:", clf4.best_params_)
print("KNN Train score:", clf4.score(X_train_scaled, y_train))
print("KNN Test score:", clf4.score(X_test_scaled, y_test))
ax = plt.subplot()
Model_Predictions = clf4.predict(X_test_scaled)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('KNN Classifier')
# Binary target on the US-only data: 0 = loss (ratio <= 1), 1 = profit.
RatioClass = df1['gross_budget_ratio'].transform(lambda x: 0 if x <= 1 else 1)
RatioClass = RatioClass.to_frame()
Counts=RatioClass['gross_budget_ratio'].value_counts()
Counts
# df12: US-only features with the binary label as target.
df12 = df1.copy()
df12['gross_budget_ratio'] = RatioClass['gross_budget_ratio']
X = df12.drop(columns=['gross_budget_ratio'])
y = df12['gross_budget_ratio']
# Stratified 70/30 split, same seed as the other experiments.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
# ---- Binary classifiers on the US-only df12 ----
# Grids are scored on recall of the "loss" class (pos_label=0).
C_List = [0.001,0.01,0.1,1,10,100,1000]
penalty = ['l1','l2']
parameters = {'C': C_List,'penalty':penalty}
# liblinear supports both l1 and l2 penalties for binary problems.
LR = LogisticRegression( solver='liblinear')
clf1 = GridSearchCV(LR, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
print("LR best parameters:", clf1.best_params_)
# clf1.score reports the grid's scorer (class-0 recall), not accuracy.
print("LR Train score:", clf1.score(X_train, y_train))
print("LR Test score:", clf1.score(X_test, y_test))
# accuracy_score args are swapped vs (y_true, y_pred) but accuracy is
# symmetric, so the value is unaffected.
print("LR Test Accuracy score:", accuracy_score(y_pred, y_test))
# Confusion matrix is (predictions, truth) -- transposed vs sklearn's
# convention; the flipped axis labels compensate.
ax = plt.subplot()
Model_Predictions = clf1.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
# Half-cell y-limit nudge: matplotlib 3.1.x heatmap row-clipping workaround.
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Logistic Regression')
# Decision tree tuned for class-0 recall.
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTC = DecisionTreeClassifier(random_state=10)
clf2 = GridSearchCV(DTC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf2.fit(X_train, y_train)
y_pred = clf2.predict(X_test)
print("DTC best parameters:", clf2.best_params_)
print("DTC Train score:", clf2.score(X_train, y_train))
print("DTC Test score:", clf2.score(X_test, y_test))
print("DTC Test Accuracy score:", accuracy_score(y_pred, y_test))
ax = plt.subplot()
Model_Predictions = clf2.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Decision Tree Classifier')
# Random forest tuned for class-0 recall.
RFEstimatorList = [25,50,100,200]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFC = RandomForestClassifier(random_state=10)
clf3 = GridSearchCV(RFC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("RFC best parameters:", clf3.best_params_)
print("RFC Train score:", clf3.score(X_train, y_train))
print("RFC Test score:", clf3.score(X_test, y_test))
print("RFC Test Accuracy score:", accuracy_score(y_pred, y_test))
ax = plt.subplot()
Model_Predictions = clf3.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Random Forest Classifier')
# KNN on min-max-scaled features (scaler fit on train only), class-0 recall.
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
Neighbor_List=[3,5,10,20,50]
parameters = {'n_neighbors':Neighbor_List}
KNNC = KNeighborsClassifier()
clf4 = GridSearchCV(KNNC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf4.fit(X_train_scaled, y_train)
y_pred = clf4.predict(X_test_scaled)
print("KNN best parameters:", clf4.best_params_)
print("KNN Train score:", clf4.score(X_train_scaled, y_train))
print("KNN Test score:", clf4.score(X_test_scaled, y_test))
print("KNN Test Accuracy score:", accuracy_score(y_pred, y_test))
ax = plt.subplot()
Model_Predictions = clf4.predict(X_test_scaled)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('KNN Classifier')
# df2: log-transformed variant of the US-only df1 to compress the
# heavy-tailed budget/popularity features.
df2 = df1.copy()
df2['budget'] = np.log(df2['budget'])
# Facebook-like counts <= 3 are mapped to 0 instead of log(x); this avoids
# log(0) = -inf for zero-like counts.
df2['director_facebook_likes'] = df2['director_facebook_likes'].transform(lambda x: 0 if x <= 3 else np.log(x))
df2['actor_1_facebook_likes'] = df2['actor_1_facebook_likes'].transform(lambda x: 0 if x <= 3 else np.log(x))
df2['actor_2_facebook_likes'] = df2['actor_2_facebook_likes'].transform(lambda x: 0 if x <= 3 else np.log(x))
df2['actor_3_facebook_likes'] = df2['actor_3_facebook_likes'].transform(lambda x: 0 if x <= 3 else np.log(x))
df2['cast_total_facebook_likes'] = df2['cast_total_facebook_likes'].transform(lambda x: 0 if x <= 3 else np.log(x))
df2.columns
# Drop features found weak in the earlier experiments (genre flags etc.).
df2 = df2.drop(columns = ['facenumber_in_poster', 'Biography', 'Crime',
                          'Mystery_Thriller_Horror', 'Sci-Fi_Fantasy',
                          'History_War'])
RatioClass = df2['gross_budget_ratio'].transform(lambda x: 0 if x <= 1 else (1 if x <= 2 else 2))
RatioClass = RatioClass.to_frame()
Counts=RatioClass['gross_budget_ratio'].value_counts()
Counts
df21 = df2.copy()
df21['gross_budget_ratio'] = RatioClass['gross_budget_ratio']
df21
X = df21.drop(columns=['gross_budget_ratio'])
y = df21['gross_budget_ratio']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
# Oversample the training split with SMOTE. sampling_strategy='minority'
# targets only the smallest class, so the second pass oversamples the
# next-smallest class as well -- presumably to balance all three ratio
# classes (TODO confirm intent).
# NOTE: fit_sample() was removed in imbalanced-learn 0.8; fit_resample()
# is the supported API, and the strategy must be passed by keyword.
smote = SMOTE(sampling_strategy='minority')
X_s, y_s = smote.fit_resample(X_train, y_train)
X_sm, y_sm = smote.fit_resample(X_s, y_s)
# Shuffle so synthetic samples are not grouped at the end of the arrays.
X_sm_train, y_sm_train = shuffle(X_sm, y_sm, random_state=10)
# ==== Multiclass experiments: 0 = Loss, 1 = Low Profit, 2 = High Profit ====
# --- Multinomial logistic regression, grid-searched on accuracy ---
C_List = [0.001,0.01,0.1,1,10,100,1000]
# NOTE(review): the string 'none' was removed in scikit-learn 1.2 (newer
# versions expect penalty=None) -- confirm the pinned sklearn version.
penalty = ['l2','none']
parameters = {'C': C_List,'penalty':penalty}
MLR = LogisticRegression(multi_class='multinomial', solver='newton-cg')
clf1 = GridSearchCV(MLR, parameters, cv=5, verbose=0,scoring ='accuracy')
# Only this model trains on the SMOTE-balanced set; the tree, forest and
# KNN below fit the raw X_train -- presumably deliberate, worth confirming.
clf1.fit(X_sm_train, y_sm_train)
y_pred = clf1.predict(X_test)
print("MLR best parameters:", clf1.best_params_)
print("MLR Train score:", clf1.score(X_sm_train, y_sm_train))
print("MLR Test score:", clf1.score(X_test, y_test))
ax = plt.subplot()
Model_Predictions = clf1.predict(X_test)
# (predictions, truth) order: rows = predicted, cols = true labels --
# transposed vs sklearn's default; the axis labels below match that.
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
# Half-cell y-limit fix for the matplotlib 3.1.x heatmap-cropping bug.
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Multiclass Logistic Regression')
# --- Decision tree ---
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTC = DecisionTreeClassifier(random_state=10)
clf2 = GridSearchCV(DTC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf2.fit(X_train, y_train)
y_pred = clf2.predict(X_test)
print("DTC best parameters:", clf2.best_params_)
print("DTC Train score:", clf2.score(X_train, y_train))
print("DTC Test score:", clf2.score(X_test, y_test))
ax = plt.subplot()
Model_Predictions = clf2.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Decision Tree Classifier')
# --- Random forest ---
RFEstimatorList = [25,50,100,200]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFC = RandomForestClassifier(random_state=10)
clf3 = GridSearchCV(RFC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("RFC best parameters:", clf3.best_params_)
print("RFC Train score:", clf3.score(X_train, y_train))
print("RFC Test score:", clf3.score(X_test, y_test))
ax = plt.subplot()
Model_Predictions = clf3.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Random Forest Classifier')
# --- KNN on min-max scaled features (scaler fit on train only) ---
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
Neighbor_List=[3,5,10,20,50]
parameters = {'n_neighbors':Neighbor_List}
KNNC = KNeighborsClassifier()
clf4 = GridSearchCV(KNNC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf4.fit(X_train_scaled, y_train)
y_pred = clf4.predict(X_test_scaled)
print("KNN best parameters:", clf4.best_params_)
print("KNN Train score:", clf4.score(X_train_scaled, y_train))
print("KNN Test score:", clf4.score(X_test_scaled, y_test))
ax = plt.subplot()
Model_Predictions = clf4.predict(X_test_scaled)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('KNN Classifier')
# ---- Re-bin the same features into a binary target: 0 = Loss, 1 = Profit ----
RatioClass = df2['gross_budget_ratio'].transform(lambda x: 0 if x <= 1 else 1)
RatioClass = RatioClass.to_frame()
Counts=RatioClass['gross_budget_ratio'].value_counts()
# Notebook display of the class balance.
Counts
df22 = df2.copy()
# Replace the continuous ratio with the binary label.
df22['gross_budget_ratio'] = RatioClass['gross_budget_ratio']
X = df22.drop(columns=['gross_budget_ratio'])
y = df22['gross_budget_ratio']
# Stratified 70/30 split with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
# ==== Binary experiments (Loss vs Profit), scored on Loss-class recall ====
# --- Logistic regression ---
C_List = [0.001,0.01,0.1,1,10,100,1000]
# liblinear supports both l1 and l2 penalties.
penalty = ['l1','l2']
parameters = {'C': C_List,'penalty':penalty}
LR = LogisticRegression( solver='liblinear')
clf1 = GridSearchCV(LR, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
print("LR best parameters:", clf1.best_params_)
# .score() reports Loss-recall (the grid's scoring metric), not accuracy.
print("LR Train score:", clf1.score(X_train, y_train))
print("LR Test score:", clf1.score(X_test, y_test))
# Fixed argument order: sklearn's signature is (y_true, y_pred); the value
# is unchanged since accuracy is symmetric.
print("LR Test Accuracy score:", accuracy_score(y_test, y_pred))
ax = plt.subplot()
Model_Predictions = clf1.predict(X_test)
# (predictions, truth) order: rows = predicted, cols = true labels --
# transposed vs sklearn's default; axis labels below match that.
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
# Half-cell y-limit fix for the matplotlib 3.1.x heatmap-cropping bug.
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Logistic Regression')
# --- Decision tree ---
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTC = DecisionTreeClassifier(random_state=10)
clf2 = GridSearchCV(DTC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf2.fit(X_train, y_train)
y_pred = clf2.predict(X_test)
print("DTC best parameters:", clf2.best_params_)
print("DTC Train score:", clf2.score(X_train, y_train))
print("DTC Test score:", clf2.score(X_test, y_test))
print("DTC Test Accuracy score:", accuracy_score(y_test, y_pred))
ax = plt.subplot()
Model_Predictions = clf2.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Decision Tree Classifier')
# --- Random forest ---
RFEstimatorList = [25,50,100,200]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFC = RandomForestClassifier(random_state=10)
clf3 = GridSearchCV(RFC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("RFC best parameters:", clf3.best_params_)
print("RFC Train score:", clf3.score(X_train, y_train))
print("RFC Test score:", clf3.score(X_test, y_test))
print("RFC Test Accuracy score:", accuracy_score(y_test, y_pred))
ax = plt.subplot()
Model_Predictions = clf3.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Random Forest Classifier')
# --- KNN on min-max scaled features (scaler fit on train only) ---
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
Neighbor_List=[3,5,10,20,50]
parameters = {'n_neighbors':Neighbor_List}
KNNC = KNeighborsClassifier()
clf4 = GridSearchCV(KNNC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf4.fit(X_train_scaled, y_train)
y_pred = clf4.predict(X_test_scaled)
print("KNN best parameters:", clf4.best_params_)
print("KNN Train score:", clf4.score(X_train_scaled, y_train))
print("KNN Test score:", clf4.score(X_test_scaled, y_test))
print("KNN Test Accuracy score:", accuracy_score(y_test, y_pred))
ax = plt.subplot()
Model_Predictions = clf4.predict(X_test_scaled)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('KNN Classifier')
# ==== New experiment: full (non-US-restricted) regression dataset ====
movie_df = pd.read_csv(r'data/data_regression.csv')
# Drop identifiers, free text and post-release information that would not
# be available when predicting profitability.
classification_df = movie_df.drop(columns=['imdb_id', 'director_name', 'num_critic_for_reviews',
                                           'actor_2_name',
                                           'gross', 'genres', 'actor_1_name',
                                           'movie_title', 'num_voted_users',
                                           'actor_3_name', 'plot_keywords',
                                           'num_user_for_reviews',
                                           'title_year', 'imdb_score',
                                           'movie_facebook_likes','director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_facebook_likes', 'actor_1_facebook_likes'])
# Notebook-style displays.
classification_df
classification_df.columns
classification1_df=classification_df.drop(columns=['country','language'])
# One-hot encode the content rating, then drop the rare categories.
df1 = pd.get_dummies(classification1_df, columns=['content_rating'])
df1 = df1.drop(columns =['content_rating_Not Rated','content_rating_G','content_rating_X','content_rating_NC-17'])
# Remove gross/budget-ratio outliers via the 1.5*IQR rule.
Q1 = df1['gross_budget_ratio'].quantile(0.25)
Q3 = df1['gross_budget_ratio'].quantile(0.75)
IQR = Q3 - Q1 #IQR is interquartile range.
filtering = (df1['gross_budget_ratio'] >= Q1 - 1.5 * IQR) & (df1['gross_budget_ratio'] <= Q3 + 1.5 *IQR)
df1=df1.loc[filtering]
df1
# 3-class target: 0 = Loss (<=1), 1 = Low Profit (1,2], 2 = High Profit (>2).
RatioClass = df1['gross_budget_ratio'].transform(lambda x: 0 if x <= 1 else (1 if x <= 2 else 2))
RatioClass = RatioClass.to_frame()
Counts=RatioClass['gross_budget_ratio'].value_counts()
Counts
df11 = df1.copy()
df11['gross_budget_ratio'] = RatioClass['gross_budget_ratio']
df11
# Stratified 70/30 split with a fixed seed.
X = df11.drop(columns=['gross_budget_ratio'])
y = df11['gross_budget_ratio']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
# Oversample the training split with SMOTE (two passes so both minority
# classes of the 3-class target get oversampled -- TODO confirm intent).
# NOTE: fit_sample() was removed in imbalanced-learn 0.8; use fit_resample()
# and pass the strategy by keyword.
smote = SMOTE(sampling_strategy='minority')
X_s, y_s = smote.fit_resample(X_train, y_train)
X_sm, y_sm = smote.fit_resample(X_s, y_s)
# Shuffle so synthetic samples are not grouped at the end of the arrays.
X_sm_train, y_sm_train = shuffle(X_sm, y_sm, random_state=10)
# ==== Multiclass run (Loss / Low Profit / High Profit) on this dataset ====
# Same four-model pipeline as the earlier multiclass section.
# --- Multinomial logistic regression ---
C_List = [0.001,0.01,0.1,1,10,100,1000]
# NOTE(review): 'none' was removed in scikit-learn 1.2 (use penalty=None
# there) -- confirm the pinned sklearn version.
penalty = ['l2','none']
parameters = {'C': C_List,'penalty':penalty}
MLR = LogisticRegression(multi_class='multinomial', solver='newton-cg')
clf1 = GridSearchCV(MLR, parameters, cv=5, verbose=0,scoring ='accuracy')
# Only this model trains on the SMOTE-balanced set; the others use raw
# X_train -- presumably deliberate, worth confirming.
clf1.fit(X_sm_train, y_sm_train)
y_pred = clf1.predict(X_test)
print("MLR best parameters:", clf1.best_params_)
print("MLR Train score:", clf1.score(X_sm_train, y_sm_train))
print("MLR Test score:", clf1.score(X_test, y_test))
ax = plt.subplot()
Model_Predictions = clf1.predict(X_test)
# (predictions, truth): rows = predicted, cols = true; labels match below.
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
# Half-cell y-limit fix for the matplotlib 3.1.x heatmap-cropping bug.
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Multiclass Logistic Regression')
# --- Decision tree ---
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTC = DecisionTreeClassifier(random_state=10)
clf2 = GridSearchCV(DTC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf2.fit(X_train, y_train)
y_pred = clf2.predict(X_test)
print("DTC best parameters:", clf2.best_params_)
print("DTC Train score:", clf2.score(X_train, y_train))
print("DTC Test score:", clf2.score(X_test, y_test))
ax = plt.subplot()
Model_Predictions = clf2.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Decision Tree Classifier')
# --- Random forest ---
RFEstimatorList = [25,50,100,200]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFC = RandomForestClassifier(random_state=10)
clf3 = GridSearchCV(RFC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("RFC best parameters:", clf3.best_params_)
print("RFC Train score:", clf3.score(X_train, y_train))
print("RFC Test score:", clf3.score(X_test, y_test))
ax = plt.subplot()
Model_Predictions = clf3.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Random Forest Classifier')
# --- KNN on min-max scaled features (scaler fit on train only) ---
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
Neighbor_List=[3,5,10,20,50]
parameters = {'n_neighbors':Neighbor_List}
KNNC = KNeighborsClassifier()
clf4 = GridSearchCV(KNNC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf4.fit(X_train_scaled, y_train)
y_pred = clf4.predict(X_test_scaled)
print("KNN best parameters:", clf4.best_params_)
print("KNN Train score:", clf4.score(X_train_scaled, y_train))
print("KNN Test score:", clf4.score(X_test_scaled, y_test))
ax = plt.subplot()
Model_Predictions = clf4.predict(X_test_scaled)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('KNN Classifier')
# ---- Re-bin the target as binary: 0 = Loss, 1 = Profit ----
RatioClass = df1['gross_budget_ratio'].transform(lambda x: 0 if x <= 1 else 1)
RatioClass = RatioClass.to_frame()
Counts=RatioClass['gross_budget_ratio'].value_counts()
# Notebook display of the class balance.
Counts
df12 = df1.copy()
df12['gross_budget_ratio'] = RatioClass['gross_budget_ratio']
X = df12.drop(columns=['gross_budget_ratio'])
y = df12['gross_budget_ratio']
# Stratified 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
# ==== Binary run (Loss vs Profit), scored on Loss-class recall ====
# --- Logistic regression ---
C_List = [0.001,0.01,0.1,1,10,100,1000]
penalty = ['l1','l2']
parameters = {'C': C_List,'penalty':penalty}
LR = LogisticRegression( solver='liblinear')
clf1 = GridSearchCV(LR, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
print("LR best parameters:", clf1.best_params_)
# .score() reports Loss-recall (the grid's scoring metric), not accuracy.
print("LR Train score:", clf1.score(X_train, y_train))
print("LR Test score:", clf1.score(X_test, y_test))
# Fixed argument order: sklearn's signature is (y_true, y_pred); the value
# is unchanged since accuracy is symmetric.
print("LR Test Accuracy score:", accuracy_score(y_test, y_pred))
ax = plt.subplot()
Model_Predictions = clf1.predict(X_test)
# (predictions, truth): rows = predicted, cols = true; labels match below.
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
# Half-cell y-limit fix for the matplotlib 3.1.x heatmap-cropping bug.
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Logistic Regression')
# --- Decision tree ---
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTC = DecisionTreeClassifier(random_state=10)
clf2 = GridSearchCV(DTC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf2.fit(X_train, y_train)
y_pred = clf2.predict(X_test)
print("DTC best parameters:", clf2.best_params_)
print("DTC Train score:", clf2.score(X_train, y_train))
print("DTC Test score:", clf2.score(X_test, y_test))
print("DTC Test Accuracy score:", accuracy_score(y_test, y_pred))
ax = plt.subplot()
Model_Predictions = clf2.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Decision Tree Classifier')
# --- Random forest ---
RFEstimatorList = [25,50,100,200]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFC = RandomForestClassifier(random_state=10)
clf3 = GridSearchCV(RFC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("RFC best parameters:", clf3.best_params_)
print("RFC Train score:", clf3.score(X_train, y_train))
print("RFC Test score:", clf3.score(X_test, y_test))
print("RFC Test Accuracy score:", accuracy_score(y_test, y_pred))
ax = plt.subplot()
Model_Predictions = clf3.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Random Forest Classifier')
# --- KNN on min-max scaled features (scaler fit on train only) ---
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
Neighbor_List=[3,5,10,20,50]
parameters = {'n_neighbors':Neighbor_List}
KNNC = KNeighborsClassifier()
clf4 = GridSearchCV(KNNC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf4.fit(X_train_scaled, y_train)
y_pred = clf4.predict(X_test_scaled)
print("KNN best parameters:", clf4.best_params_)
print("KNN Train score:", clf4.score(X_train_scaled, y_train))
print("KNN Test score:", clf4.score(X_test_scaled, y_test))
print("KNN Test Accuracy score:", accuracy_score(y_test, y_pred))
ax = plt.subplot()
Model_Predictions = clf4.predict(X_test_scaled)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('KNN Classifier')
# ---- Feature engineering on this dataset: log-transform skewed features ----
# Values <= 3 are mapped to 0 before np.log to avoid log(0) / tiny counts.
df2 = df1.copy()
df2['budget'] = np.log(df2['budget'])
df2['cast_total_facebook_likes'] = df2['cast_total_facebook_likes'].transform(lambda x: 0 if x <= 3 else np.log(x))
# Notebook display of the column list.
df2.columns
# Drop columns -- presumably judged uninformative in earlier analysis
# (TODO confirm against the feature-selection section).
df2 = df2.drop(columns = ['facenumber_in_poster', 'Biography', 'Crime',
                          'Mystery_Thriller_Horror', 'Sci-Fi_Fantasy',
                          'History_War'])
# 3-class target: 0 = Loss (<=1), 1 = Low Profit (1,2], 2 = High Profit (>2).
RatioClass = df2['gross_budget_ratio'].transform(lambda x: 0 if x <= 1 else (1 if x <= 2 else 2))
RatioClass = RatioClass.to_frame()
Counts=RatioClass['gross_budget_ratio'].value_counts()
Counts
df21 = df2.copy()
df21['gross_budget_ratio'] = RatioClass['gross_budget_ratio']
df21
# Stratified 70/30 split with a fixed seed.
X = df21.drop(columns=['gross_budget_ratio'])
y = df21['gross_budget_ratio']
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
# SMOTE oversampling of the training split (two passes so both minority
# classes of the 3-class target get oversampled -- TODO confirm intent).
# NOTE: fit_sample() was removed in imbalanced-learn 0.8; use fit_resample()
# and pass the strategy by keyword.
smote = SMOTE(sampling_strategy='minority')
X_s, y_s = smote.fit_resample(X_train, y_train)
X_sm, y_sm = smote.fit_resample(X_s, y_s)
# Shuffle so synthetic samples are not grouped at the end of the arrays.
X_sm_train, y_sm_train = shuffle(X_sm, y_sm, random_state=10)
# ==== Multiclass run on the log-transformed features ====
# --- Multinomial logistic regression ---
C_List = [0.001,0.01,0.1,1,10,100,1000]
# NOTE(review): 'none' was removed in scikit-learn 1.2 (use penalty=None
# there) -- confirm the pinned sklearn version.
penalty = ['l2','none']
parameters = {'C': C_List,'penalty':penalty}
MLR = LogisticRegression(multi_class='multinomial', solver='newton-cg')
clf1 = GridSearchCV(MLR, parameters, cv=5, verbose=0,scoring ='accuracy')
# Only this model trains on the SMOTE-balanced set; the others use raw
# X_train -- presumably deliberate, worth confirming.
clf1.fit(X_sm_train, y_sm_train)
y_pred = clf1.predict(X_test)
print("MLR best parameters:", clf1.best_params_)
print("MLR Train score:", clf1.score(X_sm_train, y_sm_train))
print("MLR Test score:", clf1.score(X_test, y_test))
ax = plt.subplot()
Model_Predictions = clf1.predict(X_test)
# (predictions, truth): rows = predicted, cols = true; labels match below.
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
# Half-cell y-limit fix for the matplotlib 3.1.x heatmap-cropping bug.
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Multiclass Logistic Regression')
# --- Decision tree ---
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTC = DecisionTreeClassifier(random_state=10)
clf2 = GridSearchCV(DTC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf2.fit(X_train, y_train)
y_pred = clf2.predict(X_test)
print("DTC best parameters:", clf2.best_params_)
print("DTC Train score:", clf2.score(X_train, y_train))
print("DTC Test score:", clf2.score(X_test, y_test))
ax = plt.subplot()
Model_Predictions = clf2.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Decision Tree Classifier')
# --- Random forest ---
RFEstimatorList = [25,50,100,200]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFC = RandomForestClassifier(random_state=10)
clf3 = GridSearchCV(RFC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("RFC best parameters:", clf3.best_params_)
print("RFC Train score:", clf3.score(X_train, y_train))
print("RFC Test score:", clf3.score(X_test, y_test))
ax = plt.subplot()
Model_Predictions = clf3.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Random Forest Classifier')
# --- KNN on min-max scaled features (scaler fit on train only) ---
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
Neighbor_List=[3,5,10,20,50]
parameters = {'n_neighbors':Neighbor_List}
KNNC = KNeighborsClassifier()
clf4 = GridSearchCV(KNNC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf4.fit(X_train_scaled, y_train)
y_pred = clf4.predict(X_test_scaled)
print("KNN best parameters:", clf4.best_params_)
print("KNN Train score:", clf4.score(X_train_scaled, y_train))
print("KNN Test score:", clf4.score(X_test_scaled, y_test))
ax = plt.subplot()
Model_Predictions = clf4.predict(X_test_scaled)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('KNN Classifier')
# ---- Re-bin the target as binary: 0 = Loss, 1 = Profit ----
RatioClass = df2['gross_budget_ratio'].transform(lambda x: 0 if x <= 1 else 1)
RatioClass = RatioClass.to_frame()
Counts=RatioClass['gross_budget_ratio'].value_counts()
# Notebook display of the class balance.
Counts
df22 = df2.copy()
df22['gross_budget_ratio'] = RatioClass['gross_budget_ratio']
X = df22.drop(columns=['gross_budget_ratio'])
y = df22['gross_budget_ratio']
# Stratified 70/30 split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
# ==== Binary run (Loss vs Profit), scored on Loss-class recall ====
# --- Logistic regression ---
C_List = [0.001,0.01,0.1,1,10,100,1000]
penalty = ['l1','l2']
parameters = {'C': C_List,'penalty':penalty}
LR = LogisticRegression( solver='liblinear')
clf1 = GridSearchCV(LR, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
print("LR best parameters:", clf1.best_params_)
# .score() reports Loss-recall (the grid's scoring metric), not accuracy.
print("LR Train score:", clf1.score(X_train, y_train))
print("LR Test score:", clf1.score(X_test, y_test))
# Fixed argument order: sklearn's signature is (y_true, y_pred); the value
# is unchanged since accuracy is symmetric.
print("LR Test Accuracy score:", accuracy_score(y_test, y_pred))
ax = plt.subplot()
Model_Predictions = clf1.predict(X_test)
# (predictions, truth): rows = predicted, cols = true; labels match below.
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
# Half-cell y-limit fix for the matplotlib 3.1.x heatmap-cropping bug.
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Logistic Regression')
# --- Decision tree ---
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTC = DecisionTreeClassifier(random_state=10)
clf2 = GridSearchCV(DTC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf2.fit(X_train, y_train)
y_pred = clf2.predict(X_test)
print("DTC best parameters:", clf2.best_params_)
print("DTC Train score:", clf2.score(X_train, y_train))
print("DTC Test score:", clf2.score(X_test, y_test))
print("DTC Test Accuracy score:", accuracy_score(y_test, y_pred))
ax = plt.subplot()
Model_Predictions = clf2.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Decision Tree Classifier')
# --- Random forest ---
RFEstimatorList = [25,50,100,200]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFC = RandomForestClassifier(random_state=10)
clf3 = GridSearchCV(RFC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("RFC best parameters:", clf3.best_params_)
print("RFC Train score:", clf3.score(X_train, y_train))
print("RFC Test score:", clf3.score(X_test, y_test))
print("RFC Test Accuracy score:", accuracy_score(y_test, y_pred))
ax = plt.subplot()
Model_Predictions = clf3.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Random Forest Classifier')
# --- KNN on min-max scaled features (scaler fit on train only) ---
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
Neighbor_List=[3,5,10,20,50]
parameters = {'n_neighbors':Neighbor_List}
KNNC = KNeighborsClassifier()
clf4 = GridSearchCV(KNNC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf4.fit(X_train_scaled, y_train)
y_pred = clf4.predict(X_test_scaled)
print("KNN best parameters:", clf4.best_params_)
print("KNN Train score:", clf4.score(X_train_scaled, y_train))
print("KNN Test score:", clf4.score(X_test_scaled, y_test))
print("KNN Test Accuracy score:", accuracy_score(y_test, y_pred))
ax = plt.subplot()
Model_Predictions = clf4.predict(X_test_scaled)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('KNN Classifier')
# Load the US-only regression dataset and drop identifier / post-release /
# leakage-prone columns to build the classification feature set.
movie_df = pd.read_csv(r'data/data_regression_onlyUS.csv')
classification_df = movie_df.drop(columns=['imdb_id', 'director_name', 'num_critic_for_reviews',
'actor_2_name',
'gross', 'genres', 'actor_1_name',
'movie_title', 'num_voted_users',
'actor_3_name', 'plot_keywords',
'num_user_for_reviews',
'title_year', 'imdb_score',
'movie_facebook_likes','director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_facebook_likes', 'actor_1_facebook_likes'])
classification_df
classification_df.columns
# US-only data, so 'language' carries almost no information; drop it.
classification1_df=classification_df.drop(columns=['language'])
# One-hot encode the MPAA rating, then drop the rare categories to limit dimensionality.
df1 = pd.get_dummies(classification1_df, columns=['content_rating'])
df1 = df1.drop(columns =['content_rating_Not Rated','content_rating_G','content_rating_X','content_rating_NC-17'])
# Remove gross/budget-ratio outliers with the standard 1.5*IQR Tukey fence.
Q1 = df1['gross_budget_ratio'].quantile(0.25)
Q3 = df1['gross_budget_ratio'].quantile(0.75)
IQR = Q3 - Q1 #IQR is interquartile range.
filtering = (df1['gross_budget_ratio'] >= Q1 - 1.5 * IQR) & (df1['gross_budget_ratio'] <= Q3 + 1.5 *IQR)
df1=df1.loc[filtering]
df1
# Discretise the ratio into three classes:
# 0 = Loss (<=1), 1 = Low Profit (<=2), 2 = High Profit (>2).
RatioClass = df1['gross_budget_ratio'].transform(lambda x: 0 if x <= 1 else (1 if x <= 2 else 2))
RatioClass = RatioClass.to_frame()
Counts=RatioClass['gross_budget_ratio'].value_counts()
Counts
df11 = df1.copy()
df11['gross_budget_ratio'] = RatioClass['gross_budget_ratio']
df11
X = df11.drop(columns=['gross_budget_ratio'])
y = df11['gross_budget_ratio']
# Stratified split keeps the class proportions identical in train and test.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
# Balance the 3-class training set with SMOTE before fitting the
# multinomial logistic regression.
# FIX: imblearn renamed `fit_sample` to `fit_resample` and made the
# sampling strategy a keyword (`sampling_strategy`); the positional form and
# `fit_sample` were removed in imblearn 0.6, so the old code crashes on any
# current install. The keyword/`fit_resample` form below is equivalent on
# imblearn >= 0.4.
smote = SMOTE(sampling_strategy='minority')
# 'minority' only oversamples the single smallest class; after the first
# pass a different class becomes the minority, so a second pass is applied
# to balance all three classes.
X_s, y_s = smote.fit_resample(X_train, y_train)
X_sm, y_sm = smote.fit_resample(X_s, y_s)
# Shuffle so the synthetic samples are not grouped at the end of the arrays.
X_sm_train, y_sm_train = shuffle(X_sm, y_sm, random_state=10)
# --- Multiclass logistic regression, fit on the SMOTE-balanced training set ---
C_List = [0.001,0.01,0.1,1,10,100,1000]
penalty = ['l2','none']
parameters = {'C': C_List,'penalty':penalty}
MLR = LogisticRegression(multi_class='multinomial', solver='newton-cg')
clf1 = GridSearchCV(MLR, parameters, cv=5, verbose=0,scoring ='accuracy')
clf1.fit(X_sm_train, y_sm_train)
y_pred = clf1.predict(X_test)
print("MLR best parameters:", clf1.best_params_)
print("MLR Train score:", clf1.score(X_sm_train, y_sm_train))
print("MLR Test score:", clf1.score(X_test, y_test))
# NOTE(review): confusion_matrix(pred, true) transposes the usual
# (true, pred) layout; the swapped axis labels below compensate.
ax = plt.subplot()
Model_Predictions = clf1.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
# Half-cell y-limit padding (seaborn-heatmap cropping workaround -- see note above split).
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Multiclass Logistic Regression')
# --- Decision tree (fit on the original, unbalanced training set) ---
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTC = DecisionTreeClassifier(random_state=10)
clf2 = GridSearchCV(DTC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf2.fit(X_train, y_train)
y_pred = clf2.predict(X_test)
print("DTC best parameters:", clf2.best_params_)
print("DTC Train score:", clf2.score(X_train, y_train))
print("DTC Test score:", clf2.score(X_test, y_test))
ax = plt.subplot()
Model_Predictions = clf2.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Decision Tree Classifier')
# --- Random forest ---
RFEstimatorList = [25,50,100,200]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFC = RandomForestClassifier(random_state=10)
clf3 = GridSearchCV(RFC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("RFC best parameters:", clf3.best_params_)
print("RFC Train score:", clf3.score(X_train, y_train))
print("RFC Test score:", clf3.score(X_test, y_test))
ax = plt.subplot()
Model_Predictions = clf3.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Random Forest Classifier')
# --- KNN: distance-based, so features are min-max scaled (fit on train only) ---
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
Neighbor_List=[3,5,10,20,50]
parameters = {'n_neighbors':Neighbor_List}
KNNC = KNeighborsClassifier()
clf4 = GridSearchCV(KNNC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf4.fit(X_train_scaled, y_train)
y_pred = clf4.predict(X_test_scaled)
print("KNN best parameters:", clf4.best_params_)
print("KNN Train score:", clf4.score(X_train_scaled, y_train))
print("KNN Test score:", clf4.score(X_test_scaled, y_test))
ax = plt.subplot()
Model_Predictions = clf4.predict(X_test_scaled)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('KNN Classifier')
# Collapse the ratio into a binary target: 0 = Loss (ratio <= 1), 1 = Profit.
RatioClass = df1['gross_budget_ratio'].transform(lambda x: 0 if x <= 1 else 1)
RatioClass = RatioClass.to_frame()
Counts=RatioClass['gross_budget_ratio'].value_counts()
Counts
df12 = df1.copy()
df12['gross_budget_ratio'] = RatioClass['gross_budget_ratio']
X = df12.drop(columns=['gross_budget_ratio'])
y = df12['gross_budget_ratio']
# Stratified split keeps the Loss/Profit proportions in train and test.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
# --- Binary logistic regression; all four models below are tuned for
# recall of class 0 ("Loss"), i.e. catching money-losing movies. ---
C_List = [0.001,0.01,0.1,1,10,100,1000]
penalty = ['l1','l2']
parameters = {'C': C_List,'penalty':penalty}
LR = LogisticRegression( solver='liblinear')
clf1 = GridSearchCV(LR, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
print("LR best parameters:", clf1.best_params_)
print("LR Train score:", clf1.score(X_train, y_train))
print("LR Test score:", clf1.score(X_test, y_test))
# accuracy_score is symmetric, so the (pred, true) argument order is harmless.
print("LR Test Accuracy score:", accuracy_score(y_pred, y_test))
# NOTE(review): confusion_matrix(pred, true) transposes the usual layout;
# the swapped axis labels below compensate.
ax = plt.subplot()
Model_Predictions = clf1.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
# Half-cell y-limit padding (seaborn-heatmap cropping workaround).
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Logistic Regression')
# --- Decision tree ---
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTC = DecisionTreeClassifier(random_state=10)
clf2 = GridSearchCV(DTC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf2.fit(X_train, y_train)
y_pred = clf2.predict(X_test)
print("DTC best parameters:", clf2.best_params_)
print("DTC Train score:", clf2.score(X_train, y_train))
print("DTC Test score:", clf2.score(X_test, y_test))
print("DTC Test Accuracy score:", accuracy_score(y_pred, y_test))
ax = plt.subplot()
Model_Predictions = clf2.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Decision Tree Classifier')
# --- Random forest ---
RFEstimatorList = [25,50,100,200]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFC = RandomForestClassifier(random_state=10)
clf3 = GridSearchCV(RFC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("RFC best parameters:", clf3.best_params_)
print("RFC Train score:", clf3.score(X_train, y_train))
print("RFC Test score:", clf3.score(X_test, y_test))
print("RFC Test Accuracy score:", accuracy_score(y_pred, y_test))
ax = plt.subplot()
Model_Predictions = clf3.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Random Forest Classifier')
# --- KNN: distance-based, so features are min-max scaled (fit on train only) ---
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
Neighbor_List=[3,5,10,20,50]
parameters = {'n_neighbors':Neighbor_List}
KNNC = KNeighborsClassifier()
clf4 = GridSearchCV(KNNC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf4.fit(X_train_scaled, y_train)
y_pred = clf4.predict(X_test_scaled)
print("KNN best parameters:", clf4.best_params_)
print("KNN Train score:", clf4.score(X_train_scaled, y_train))
print("KNN Test score:", clf4.score(X_test_scaled, y_test))
print("KNN Test Accuracy score:", accuracy_score(y_pred, y_test))
ax = plt.subplot()
Model_Predictions = clf4.predict(X_test_scaled)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('KNN Classifier')
# Feature-engineered copy of df1: log-compress the heavy-tailed budget and
# cast-likes columns, and drop weaker features.
df2 = df1.copy()
df2['budget'] = np.log(df2['budget'])
# Counts <= 3 are mapped to 0 so the log never sees a non-positive value.
df2['cast_total_facebook_likes'] = df2['cast_total_facebook_likes'].transform(lambda x: 0 if x <= 3 else np.log(x))
df2.columns
df2 = df2.drop(columns = ['facenumber_in_poster', 'Biography', 'Crime',
'Mystery_Thriller_Horror', 'Sci-Fi_Fantasy',
'History_War'])
# Three-class target: 0 = Loss (<=1), 1 = Low Profit (<=2), 2 = High Profit (>2).
RatioClass = df2['gross_budget_ratio'].transform(lambda x: 0 if x <= 1 else (1 if x <= 2 else 2))
RatioClass = RatioClass.to_frame()
Counts=RatioClass['gross_budget_ratio'].value_counts()
Counts
df21 = df2.copy()
df21['gross_budget_ratio'] = RatioClass['gross_budget_ratio']
df21
X = df21.drop(columns=['gross_budget_ratio'])
y = df21['gross_budget_ratio']
# Stratified split keeps the class proportions identical in train and test.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
# Balance the 3-class df2 training set with SMOTE.
# FIX: imblearn renamed `fit_sample` to `fit_resample` and made the sampling
# strategy a keyword; the old positional/`fit_sample` API was removed in
# imblearn 0.6, so this code crashes on any current install. The form below
# is equivalent on imblearn >= 0.4.
smote = SMOTE(sampling_strategy='minority')
# Two passes: 'minority' only oversamples the smallest class, and after the
# first pass a different class becomes the minority.
X_s, y_s = smote.fit_resample(X_train, y_train)
X_sm, y_sm = smote.fit_resample(X_s, y_s)
# Shuffle so synthetic samples are not grouped at the end of the arrays.
X_sm_train, y_sm_train = shuffle(X_sm, y_sm, random_state=10)
# --- Multiclass logistic regression on the SMOTE-balanced df2 training set ---
C_List = [0.001,0.01,0.1,1,10,100,1000]
penalty = ['l2','none']
parameters = {'C': C_List,'penalty':penalty}
MLR = LogisticRegression(multi_class='multinomial', solver='newton-cg')
clf1 = GridSearchCV(MLR, parameters, cv=5, verbose=0,scoring ='accuracy')
clf1.fit(X_sm_train, y_sm_train)
y_pred = clf1.predict(X_test)
print("MLR best parameters:", clf1.best_params_)
print("MLR Train score:", clf1.score(X_sm_train, y_sm_train))
print("MLR Test score:", clf1.score(X_test, y_test))
# NOTE(review): confusion_matrix(pred, true) transposes the usual layout;
# the swapped axis labels below compensate.
ax = plt.subplot()
Model_Predictions = clf1.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
# Half-cell y-limit padding (seaborn-heatmap cropping workaround).
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Multiclass Logistic Regression')
# --- Decision tree (fit on the original, unbalanced training set) ---
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTC = DecisionTreeClassifier(random_state=10)
clf2 = GridSearchCV(DTC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf2.fit(X_train, y_train)
y_pred = clf2.predict(X_test)
print("DTC best parameters:", clf2.best_params_)
print("DTC Train score:", clf2.score(X_train, y_train))
print("DTC Test score:", clf2.score(X_test, y_test))
ax = plt.subplot()
Model_Predictions = clf2.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Decision Tree Classifier')
# --- Random forest ---
RFEstimatorList = [25,50,100,200]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFC = RandomForestClassifier(random_state=10)
clf3 = GridSearchCV(RFC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("RFC best parameters:", clf3.best_params_)
print("RFC Train score:", clf3.score(X_train, y_train))
print("RFC Test score:", clf3.score(X_test, y_test))
ax = plt.subplot()
Model_Predictions = clf3.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Random Forest Classifier')
# --- KNN: distance-based, so features are min-max scaled (fit on train only) ---
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
Neighbor_List=[3,5,10,20,50]
parameters = {'n_neighbors':Neighbor_List}
KNNC = KNeighborsClassifier()
clf4 = GridSearchCV(KNNC, parameters, cv=5, verbose=0,scoring ='accuracy')
clf4.fit(X_train_scaled, y_train)
y_pred = clf4.predict(X_test_scaled)
print("KNN best parameters:", clf4.best_params_)
print("KNN Train score:", clf4.score(X_train_scaled, y_train))
print("KNN Test score:", clf4.score(X_test_scaled, y_test))
ax = plt.subplot()
Model_Predictions = clf4.predict(X_test_scaled)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Low Profit",'High Profit'])
ax.yaxis.set_ticklabels(["Loss","Low Profit",'High Profit'], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('KNN Classifier')
# Collapse the ratio into a binary target on df2: 0 = Loss (<=1), 1 = Profit.
RatioClass = df2['gross_budget_ratio'].transform(lambda x: 0 if x <= 1 else 1)
RatioClass = RatioClass.to_frame()
Counts=RatioClass['gross_budget_ratio'].value_counts()
Counts
df22 = df2.copy()
df22['gross_budget_ratio'] = RatioClass['gross_budget_ratio']
X = df22.drop(columns=['gross_budget_ratio'])
y = df22['gross_budget_ratio']
# Stratified split keeps the Loss/Profit proportions in train and test.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
# --- Binary models on df22; all four are tuned for recall of class 0
# ("Loss"), i.e. catching money-losing movies. ---
C_List = [0.001,0.01,0.1,1,10,100,1000]
penalty = ['l1','l2']
parameters = {'C': C_List,'penalty':penalty}
LR = LogisticRegression( solver='liblinear')
clf1 = GridSearchCV(LR, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf1.fit(X_train, y_train)
y_pred = clf1.predict(X_test)
print("LR best parameters:", clf1.best_params_)
print("LR Train score:", clf1.score(X_train, y_train))
print("LR Test score:", clf1.score(X_test, y_test))
# accuracy_score is symmetric, so the (pred, true) argument order is harmless.
print("LR Test Accuracy score:", accuracy_score(y_pred, y_test))
# NOTE(review): confusion_matrix(pred, true) transposes the usual layout;
# the swapped axis labels below compensate.
ax = plt.subplot()
Model_Predictions = clf1.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
# Half-cell y-limit padding (seaborn-heatmap cropping workaround).
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Logistic Regression')
# --- Decision tree ---
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTC = DecisionTreeClassifier(random_state=10)
clf2 = GridSearchCV(DTC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf2.fit(X_train, y_train)
y_pred = clf2.predict(X_test)
print("DTC best parameters:", clf2.best_params_)
print("DTC Train score:", clf2.score(X_train, y_train))
print("DTC Test score:", clf2.score(X_test, y_test))
print("DTC Test Accuracy score:", accuracy_score(y_pred, y_test))
ax = plt.subplot()
Model_Predictions = clf2.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Decision Tree Classifier')
# --- Random forest ---
RFEstimatorList = [25,50,100,200]
criterion =['gini','entropy']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFC = RandomForestClassifier(random_state=10)
clf3 = GridSearchCV(RFC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("RFC best parameters:", clf3.best_params_)
print("RFC Train score:", clf3.score(X_train, y_train))
print("RFC Test score:", clf3.score(X_test, y_test))
print("RFC Test Accuracy score:", accuracy_score(y_pred, y_test))
ax = plt.subplot()
Model_Predictions = clf3.predict(X_test)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('Random Forest Classifier')
# --- KNN: distance-based, so features are min-max scaled (fit on train only) ---
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
Neighbor_List=[3,5,10,20,50]
parameters = {'n_neighbors':Neighbor_List}
KNNC = KNeighborsClassifier()
clf4 = GridSearchCV(KNNC, parameters, cv=5, verbose=0,scoring =make_scorer(recall_score,pos_label=0))
clf4.fit(X_train_scaled, y_train)
y_pred = clf4.predict(X_test_scaled)
print("KNN best parameters:", clf4.best_params_)
print("KNN Train score:", clf4.score(X_train_scaled, y_train))
print("KNN Test score:", clf4.score(X_test_scaled, y_test))
print("KNN Test Accuracy score:", accuracy_score(y_pred, y_test))
ax = plt.subplot()
Model_Predictions = clf4.predict(X_test_scaled)
Model_Confusion_Matrix = confusion_matrix(Model_Predictions,y_test)
sns.heatmap(Model_Confusion_Matrix,annot=True,fmt = "d",square = True,ax = ax, linewidths = 1,linecolor = "w",cmap = "Pastel2")
ax.set_xlabel('True labels')
ax.set_ylabel('Predicted labels')
ax.xaxis.set_ticklabels(["Loss","Profit"])
ax.yaxis.set_ticklabels(["Loss","Profit"], va="center")
b, t = plt.ylim()
b += 0.5
t -= 0.5
plt.ylim(b, t)
plt.title('KNN Classifier')
# Regression task: predict the continuous gross/budget ratio on the full
# (all-countries) dataset; drop identifiers and post-release columns.
movie_df = pd.read_csv(r'data/data_regression.csv')
classification_df = movie_df.drop(columns=['imdb_id', 'director_name', 'num_critic_for_reviews',
'actor_2_name',
'gross', 'genres', 'actor_1_name',
'movie_title', 'num_voted_users',
'actor_3_name', 'plot_keywords',
'num_user_for_reviews',
'title_year', 'imdb_score',
'movie_facebook_likes'])
classification_df
classification_df.columns
classification1_df=classification_df.drop(columns=['country','language'])
# One-hot encode the MPAA rating; drop the rare categories.
df1 = pd.get_dummies(classification1_df, columns=['content_rating'])
df1 = df1.drop(columns =['content_rating_Not Rated','content_rating_G','content_rating_X','content_rating_NC-17'])
# Remove target outliers with the standard 1.5*IQR Tukey fence.
Q1 = df1['gross_budget_ratio'].quantile(0.25)
Q3 = df1['gross_budget_ratio'].quantile(0.75)
IQR = Q3 - Q1 #IQR is interquartile range.
filtering = (df1['gross_budget_ratio'] >= Q1 - 1.5 * IQR) & (df1['gross_budget_ratio'] <= Q3 + 1.5 *IQR)
df1=df1.loc[filtering]
df1
X = df1.drop(columns=['gross_budget_ratio'])
y = df1['gross_budget_ratio']
# No stratification: the target is continuous here.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
# --- Ridge regression (scaled features) ---
# Linear models are scale-sensitive; fit the scaler on the training fold
# only and reuse it on the test fold to avoid leakage.
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
# Alpha grid: fine steps near 0, coarser further out.
parameters = {'alpha': np.concatenate((np.arange(0.1,2,0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
ridge = Ridge()
clf1 = GridSearchCV(ridge, parameters, scoring ='neg_mean_squared_error')
clf1.fit(X_train_scaled, y_train)
y_pred = clf1.predict(X_test_scaled)
print("ridge best parameters:", clf1.best_params_)
print("ridge score:", clf1.score(X_test_scaled, y_test))
print("ridge MSE:", mean_squared_error(y_test, clf1.predict(X_test_scaled)))
print("ridge best estimator coef:", clf1.best_estimator_.coef_)
# FIX: pass x/y as keywords -- positional x/y for regplot was deprecated in
# seaborn 0.12 and later removed; the keyword form works on all versions.
sns.regplot(x=y_test, y=y_pred)
# --- Decision tree regressor (unscaled: trees are scale-invariant) ---
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
# NOTE(review): 'mse' was renamed 'squared_error' in sklearn 1.0 and removed
# in 1.2 -- kept as-is for the project's pinned sklearn; update if upgrading.
criterion =['mse']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTR = DecisionTreeRegressor(random_state=10)
clf3 = GridSearchCV(DTR, parameters, scoring ='neg_mean_squared_error')
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("DTR best parameters:", clf3.best_params_)
print("DTR score:", clf3.score(X_test, y_test))
print("DTR MSE:", mean_squared_error(y_test, clf3.predict(X_test)))
sns.regplot(x=y_test, y=y_pred)
# --- Random forest regressor ---
RFEstimatorList = [25,50,100,200]
criterion =['mse']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFR = RandomForestRegressor(random_state=10)
clf4 = GridSearchCV(RFR, parameters, cv=5, verbose=0,scoring ='neg_mean_squared_error')
clf4.fit(X_train, y_train)
y_pred = clf4.predict(X_test)
print("RFR best parameters:", clf4.best_params_)
print("RFR score:", clf4.score(X_test, y_test))
print("RFR MSE:", mean_squared_error(y_test, clf4.predict(X_test)))
sns.regplot(x=y_test, y=y_pred)
# Feature-engineered regression copy: log-compress the heavy-tailed columns.
df2 = df1.copy()
df2['budget'] = np.log(df2['budget'])
# Clamp at 0.2 before the log so zero/near-zero counts do not yield -inf.
df2['director_facebook_likes'] = df2['director_facebook_likes'].transform(lambda x: np.log(0.2) if x <=0.2 else np.log(x))
df2['actor_1_facebook_likes'] = df2['actor_1_facebook_likes'].transform(lambda x: np.log(0.2) if x <=0.2 else np.log(x))
df2['actor_2_facebook_likes'] = df2['actor_2_facebook_likes'].transform(lambda x: np.log(0.2) if x <=0.2 else np.log(x))
df2['actor_3_facebook_likes'] = df2['actor_3_facebook_likes'].transform(lambda x: np.log(0.2) if x <=0.2 else np.log(x))
df2['cast_total_facebook_likes'] = df2['cast_total_facebook_likes'].transform(lambda x: np.log(0.2) if x <=0.2 else np.log(x))
# The target itself is log-transformed too (clamped at 0.1).
df2['gross_budget_ratio'] = df2['gross_budget_ratio'].transform(lambda x: np.log(0.1) if x <=0.1 else np.log(x))
df2.columns
df2 = df2.drop(columns = ['facenumber_in_poster', 'Biography', 'Crime',
'Mystery_Thriller_Horror', 'Sci-Fi_Fantasy',
'History_War'])
X = df2.drop(columns=['gross_budget_ratio'])
y = df2['gross_budget_ratio']
# No stratification: continuous target.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
# --- Ridge regression on the log-transformed features (scaled) ---
# Scaler fit on the training fold only to avoid leakage.
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
parameters = {'alpha': np.concatenate((np.arange(0.1,2,0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
ridge = Ridge()
clf1 = GridSearchCV(ridge, parameters, scoring ='neg_mean_squared_error')
clf1.fit(X_train_scaled, y_train)
y_pred = clf1.predict(X_test_scaled)
print("ridge best parameters:", clf1.best_params_)
print("ridge score:", clf1.score(X_test_scaled, y_test))
print("ridge MSE:", mean_squared_error(y_test, clf1.predict(X_test_scaled)))
print("ridge best estimator coef:", clf1.best_estimator_.coef_)
# FIX: keyword x/y -- positional regplot args were deprecated in seaborn 0.12
# and later removed; keywords work on all versions.
sns.regplot(x=y_test, y=y_pred)
# --- Decision tree regressor (unscaled: trees are scale-invariant) ---
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
# NOTE(review): 'mse' renamed 'squared_error' in sklearn 1.0, removed in 1.2.
criterion =['mse']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTR = DecisionTreeRegressor(random_state=10)
clf3 = GridSearchCV(DTR, parameters, scoring ='neg_mean_squared_error')
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("DTR best parameters:", clf3.best_params_)
print("DTR score:", clf3.score(X_test, y_test))
print("DTR MSE:", mean_squared_error(y_test, clf3.predict(X_test)))
sns.regplot(x=y_test, y=y_pred)
# --- Random forest regressor ---
RFEstimatorList = [25,50,100,200]
criterion =['mse']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFR = RandomForestRegressor(random_state=10)
clf4 = GridSearchCV(RFR, parameters, cv=5, verbose=0,scoring ='neg_mean_squared_error')
clf4.fit(X_train, y_train)
y_pred = clf4.predict(X_test)
print("RFR best parameters:", clf4.best_params_)
print("RFR score:", clf4.score(X_test, y_test))
print("RFR MSE:", mean_squared_error(y_test, clf4.predict(X_test)))
sns.regplot(x=y_test, y=y_pred)
# Refit a random forest with the tuned hyper-parameters to inspect
# feature importances (GridSearchCV objects do not expose estimators_).
clf3=RandomForestRegressor(criterion= 'mse', max_depth= 10, n_estimators= 100,random_state=10)
clf3.fit(X_train, y_train)
features = X.columns
importances = clf3.feature_importances_
# Per-feature std across the forest's individual trees, used as error bars.
std = np.std([tree.feature_importances_ for tree in clf3.estimators_],
axis=0)
# Feature indices sorted by decreasing importance.
indices = np.argsort(importances)[::-1]
# Print the feature ranking
print("Feature ranking:")
for f in range(X_train.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X_train.shape[1]), importances[indices],
color="b", yerr=std[indices], align="center")
plt.xticks(range(len(indices)), [features[i] for i in indices])
plt.xticks(rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.show()
# Hyper-parameter grid for the gradient-boosted regressor.
params = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [2, 3, 6],
    'learning_rate': [0.005, 0.01, 0.02],
    'subsample': [0.4, 0.6, 0.8],
    'objective' :['reg:squarederror']
}
# NOTE(review): `silent` was removed in xgboost 1.0 (replaced by `verbosity`);
# kept for the project's pinned xgboost -- update if upgrading.
clf = GridSearchCV(XGBRegressor(silent=False,random_state=10), params, scoring ='neg_mean_squared_error')
clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)
print("XGBR best parameters:", clf.best_params_)
print("XGBR score:", clf.score(X_test, y_test))
print("XGBR MSE:", mean_squared_error(y_test, clf.predict(X_test)))
# FIX: keyword x/y -- positional regplot args were deprecated in seaborn 0.12
# and later removed; keywords work on all versions.
sns.regplot(x=y_test, y=y_pred)
# Refit an XGB regressor with the tuned hyper-parameters to inspect
# feature importances.
clf3=XGBRegressor(learning_rate= 0.02, max_depth= 3, n_estimators= 400, objective= 'reg:squarederror', subsample= 0.8,random_state=10)
clf3.fit(X_train, y_train)
features = X.columns
importances = clf3.feature_importances_
# Feature indices sorted by decreasing importance.
indices = np.argsort(importances)[::-1]
# Print the feature ranking
print("Feature ranking:")
for f in range(X_train.shape[1]):
    print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
# Plot the feature importances
plt.figure()
plt.title("Feature importances")
# BUG FIX: the original passed yerr=std[indices], but `std` was the leftover
# per-tree deviation computed for the *RandomForest* model above, indexed
# with this model's (different) ordering -- meaningless error bars, and a
# NameError if this cell runs before the RF one. XGBoost exposes no
# comparable per-tree std here, so no error bars are drawn.
plt.bar(range(X_train.shape[1]), importances[indices],
color="b", align="center")
plt.xticks(range(len(indices)), [features[i] for i in indices])
plt.xticks(rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.show()
# Repeat the regression pipeline on the US-only dataset.
movie_df = pd.read_csv(r'data/data_regression_onlyUS.csv')
classification_df = movie_df.drop(columns=['imdb_id', 'director_name', 'num_critic_for_reviews',
'actor_2_name',
'gross', 'genres', 'actor_1_name',
'movie_title', 'num_voted_users',
'actor_3_name', 'plot_keywords',
'num_user_for_reviews',
'title_year', 'imdb_score',
'movie_facebook_likes'])
classification_df
classification_df.columns
# US-only data, so 'language' carries almost no information; drop it.
classification1_df=classification_df.drop(columns=['language'])
df1 = pd.get_dummies(classification1_df, columns=['content_rating'])
df1 = df1.drop(columns =['content_rating_Not Rated','content_rating_G','content_rating_X','content_rating_NC-17'])
# Remove target outliers with the standard 1.5*IQR Tukey fence.
Q1 = df1['gross_budget_ratio'].quantile(0.25)
Q3 = df1['gross_budget_ratio'].quantile(0.75)
IQR = Q3 - Q1 #IQR is interquartile range.
filtering = (df1['gross_budget_ratio'] >= Q1 - 1.5 * IQR) & (df1['gross_budget_ratio'] <= Q3 + 1.5 *IQR)
df1=df1.loc[filtering]
df1
X = df1.drop(columns=['gross_budget_ratio'])
y = df1['gross_budget_ratio']
# No stratification: continuous target.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
parameters = {'alpha': np.concatenate((np.arange(0.1,2,0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
ridge = Ridge()
clf1 = GridSearchCV(ridge, parameters, scoring ='neg_mean_squared_error')
clf1.fit(X_train_scaled, y_train)
y_pred = clf1.predict(X_test_scaled)
print("ridge best parameters:", clf1.best_params_)
print("ridge score:", clf1.score(X_test_scaled, y_test))
print("ridge MSE:", mean_squared_error(y_test, clf1.predict(X_test_scaled)))
print("ridge best estimator coef:", clf1.best_estimator_.coef_)
sns.regplot(y_test, y_pred )
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
criterion =['mse']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTR = DecisionTreeRegressor(random_state=10)
clf3 = GridSearchCV(DTR, parameters, scoring ='neg_mean_squared_error')
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("DTR best parameters:", clf3.best_params_)
print("DTR score:", clf3.score(X_test, y_test))
print("DTR MSE:", mean_squared_error(y_test, clf3.predict(X_test)))
sns.regplot(y_test, y_pred )
RFEstimatorList = [25,50,100,200]
criterion =['mse']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFR = RandomForestRegressor(random_state=10)
clf4 = GridSearchCV(RFR, parameters, cv=5, verbose=0,scoring ='neg_mean_squared_error')
clf4.fit(X_train, y_train)
y_pred = clf4.predict(X_test)
print("RFR best parameters:", clf4.best_params_)
print("RFR score:", clf4.score(X_test, y_test))
print("RFR MSE:", mean_squared_error(y_test, clf4.predict(X_test)))
sns.regplot(y_test, y_pred )
df2 = df1.copy()
df2['budget'] = np.log(df2['budget'])
df2['director_facebook_likes'] = df2['director_facebook_likes'].transform(lambda x: np.log(0.2) if x <=0.2 else np.log(x))
df2['actor_1_facebook_likes'] = df2['actor_1_facebook_likes'].transform(lambda x: np.log(0.2) if x <=0.2 else np.log(x))
df2['actor_2_facebook_likes'] = df2['actor_2_facebook_likes'].transform(lambda x: np.log(0.2) if x <=0.2 else np.log(x))
df2['actor_3_facebook_likes'] = df2['actor_3_facebook_likes'].transform(lambda x: np.log(0.2) if x <=0.2 else np.log(x))
df2['cast_total_facebook_likes'] = df2['cast_total_facebook_likes'].transform(lambda x: np.log(0.2) if x <=0.2 else np.log(x))
df2['gross_budget_ratio'] = df2['gross_budget_ratio'].transform(lambda x: np.log(0.1) if x <=0.1 else np.log(x))
df2.columns
df2 = df2.drop(columns = ['facenumber_in_poster', 'Biography', 'Crime',
'Mystery_Thriller_Horror', 'Sci-Fi_Fantasy',
'History_War'])
X = df2.drop(columns=['gross_budget_ratio'])
y = df2['gross_budget_ratio']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
parameters = {'alpha': np.concatenate((np.arange(0.1,2,0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
ridge = Ridge()
clf1 = GridSearchCV(ridge, parameters, scoring ='neg_mean_squared_error')
clf1.fit(X_train_scaled, y_train)
y_pred = clf1.predict(X_test_scaled)
print("ridge best parameters:", clf1.best_params_)
print("ridge score:", clf1.score(X_test_scaled, y_test))
print("ridge MSE:", mean_squared_error(y_test, clf1.predict(X_test_scaled)))
print("ridge best estimator coef:", clf1.best_estimator_.coef_)
sns.regplot(y_test, y_pred )
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
criterion =['mse']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTR = DecisionTreeRegressor(random_state=10)
clf3 = GridSearchCV(DTR, parameters, scoring ='neg_mean_squared_error')
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("DTR best parameters:", clf3.best_params_)
print("DTR score:", clf3.score(X_test, y_test))
print("DTR MSE:", mean_squared_error(y_test, clf3.predict(X_test)))
sns.regplot(y_test, y_pred )
RFEstimatorList = [25,50,100,200]
criterion =['mse']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFR = RandomForestRegressor(random_state=10)
clf4 = GridSearchCV(RFR, parameters, cv=5, verbose=0,scoring ='neg_mean_squared_error')
clf4.fit(X_train, y_train)
y_pred = clf4.predict(X_test)
print("RFR best parameters:", clf4.best_params_)
print("RFR score:", clf4.score(X_test, y_test))
print("RFR MSE:", mean_squared_error(y_test, clf4.predict(X_test)))
sns.regplot(y_test, y_pred )
# ============================================================================
# Gross/budget-ratio regression — full dataset, WITHOUT the rank features.
# Same pipeline as the previous experiment; drops director/actor rank columns.
# (Bare notebook display expressions removed: they are no-ops in a script.)
# ============================================================================
movie_df = pd.read_csv(r'data/data_regression.csv')
classification_df = movie_df.drop(columns=['imdb_id', 'director_name', 'num_critic_for_reviews',
                                           'actor_2_name',
                                           'gross', 'genres', 'actor_1_name',
                                           'movie_title', 'num_voted_users',
                                           'actor_3_name', 'plot_keywords',
                                           'num_user_for_reviews',
                                           'title_year', 'imdb_score',
                                           'movie_facebook_likes', 'director_rank', 'actor1_rank', 'actor2_rank', 'actor3_rank'])
classification1_df = classification_df.drop(columns=['country', 'language'])
df1 = pd.get_dummies(classification1_df, columns=['content_rating'])
df1 = df1.drop(columns=['content_rating_Not Rated', 'content_rating_G',
                        'content_rating_X', 'content_rating_NC-17'])
# Remove target outliers beyond the Tukey 1.5 * IQR fences.
Q1 = df1['gross_budget_ratio'].quantile(0.25)
Q3 = df1['gross_budget_ratio'].quantile(0.75)
IQR = Q3 - Q1  # IQR is interquartile range.
filtering = (df1['gross_budget_ratio'] >= Q1 - 1.5 * IQR) & (df1['gross_budget_ratio'] <= Q3 + 1.5 * IQR)
df1 = df1.loc[filtering]
X = df1.drop(columns=['gross_budget_ratio'])
y = df1['gross_budget_ratio']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
# --- Ridge regression ---
parameters = {'alpha': np.concatenate((np.arange(0.1, 2, 0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
ridge = Ridge()
clf1 = GridSearchCV(ridge, parameters, scoring='neg_mean_squared_error')
clf1.fit(X_train_scaled, y_train)
y_pred = clf1.predict(X_test_scaled)
print("ridge best parameters:", clf1.best_params_)
print("ridge score:", clf1.score(X_test_scaled, y_test))
print("ridge MSE:", mean_squared_error(y_test, clf1.predict(X_test_scaled)))
print("ridge best estimator coef:", clf1.best_estimator_.coef_)
# FIX: seaborn removed positional x/y arguments; pass them as keywords.
sns.regplot(x=y_test, y=y_pred)
# --- Decision tree regressor ---
DTSplitList = [0.001, 0.01, 0.05, 0.1, 0.5, 0.99]
DTLeafList = [0.0005, 0.005, 0.025, 0.05, 0.1, 0.25, 0.5]
criterion = ['mse']  # NOTE(review): 'mse' was renamed 'squared_error' in sklearn >= 1.0
parameters = {'criterion': criterion, 'max_depth': list(range(100, 0, -3)),
              'min_samples_leaf': DTLeafList, 'min_samples_split': DTSplitList}
DTR = DecisionTreeRegressor(random_state=10)
clf3 = GridSearchCV(DTR, parameters, scoring='neg_mean_squared_error')
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("DTR best parameters:", clf3.best_params_)
print("DTR score:", clf3.score(X_test, y_test))
print("DTR MSE:", mean_squared_error(y_test, clf3.predict(X_test)))
sns.regplot(x=y_test, y=y_pred)
# --- Random forest regressor ---
RFEstimatorList = [25, 50, 100, 200]
criterion = ['mse']
parameters = {'criterion': criterion, 'max_depth': list(range(100, 0, -10)), 'n_estimators': RFEstimatorList}
RFR = RandomForestRegressor(random_state=10)
clf4 = GridSearchCV(RFR, parameters, cv=5, verbose=0, scoring='neg_mean_squared_error')
clf4.fit(X_train, y_train)
y_pred = clf4.predict(X_test)
print("RFR best parameters:", clf4.best_params_)
print("RFR score:", clf4.score(X_test, y_test))
print("RFR MSE:", mean_squared_error(y_test, clf4.predict(X_test)))
sns.regplot(x=y_test, y=y_pred)
# --- Variant 2: log-transform skewed features (floor tiny values to avoid -inf) ---
df2 = df1.copy()
df2['budget'] = np.log(df2['budget'])
df2['director_facebook_likes'] = df2['director_facebook_likes'].transform(lambda x: np.log(0.2) if x <= 0.2 else np.log(x))
df2['actor_1_facebook_likes'] = df2['actor_1_facebook_likes'].transform(lambda x: np.log(0.2) if x <= 0.2 else np.log(x))
df2['actor_2_facebook_likes'] = df2['actor_2_facebook_likes'].transform(lambda x: np.log(0.2) if x <= 0.2 else np.log(x))
df2['actor_3_facebook_likes'] = df2['actor_3_facebook_likes'].transform(lambda x: np.log(0.2) if x <= 0.2 else np.log(x))
df2['cast_total_facebook_likes'] = df2['cast_total_facebook_likes'].transform(lambda x: np.log(0.2) if x <= 0.2 else np.log(x))
df2['gross_budget_ratio'] = df2['gross_budget_ratio'].transform(lambda x: np.log(0.1) if x <= 0.1 else np.log(x))
df2 = df2.drop(columns=['facenumber_in_poster', 'Biography', 'Crime',
                        'Mystery_Thriller_Horror', 'Sci-Fi_Fantasy',
                        'History_War'])
X = df2.drop(columns=['gross_budget_ratio'])
y = df2['gross_budget_ratio']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
# Ridge on the log-transformed data.
parameters = {'alpha': np.concatenate((np.arange(0.1, 2, 0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
ridge = Ridge()
clf1 = GridSearchCV(ridge, parameters, scoring='neg_mean_squared_error')
clf1.fit(X_train_scaled, y_train)
y_pred = clf1.predict(X_test_scaled)
print("ridge best parameters:", clf1.best_params_)
print("ridge score:", clf1.score(X_test_scaled, y_test))
print("ridge MSE:", mean_squared_error(y_test, clf1.predict(X_test_scaled)))
print("ridge best estimator coef:", clf1.best_estimator_.coef_)
sns.regplot(x=y_test, y=y_pred)
# Decision tree on the log-transformed data.
DTSplitList = [0.001, 0.01, 0.05, 0.1, 0.5, 0.99]
DTLeafList = [0.0005, 0.005, 0.025, 0.05, 0.1, 0.25, 0.5]
criterion = ['mse']
parameters = {'criterion': criterion, 'max_depth': list(range(100, 0, -3)),
              'min_samples_leaf': DTLeafList, 'min_samples_split': DTSplitList}
DTR = DecisionTreeRegressor(random_state=10)
clf3 = GridSearchCV(DTR, parameters, scoring='neg_mean_squared_error')
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("DTR best parameters:", clf3.best_params_)
print("DTR score:", clf3.score(X_test, y_test))
print("DTR MSE:", mean_squared_error(y_test, clf3.predict(X_test)))
sns.regplot(x=y_test, y=y_pred)
# Random forest on the log-transformed data.
RFEstimatorList = [25, 50, 100, 200]
criterion = ['mse']
parameters = {'criterion': criterion, 'max_depth': list(range(100, 0, -10)), 'n_estimators': RFEstimatorList}
RFR = RandomForestRegressor(random_state=10)
clf4 = GridSearchCV(RFR, parameters, cv=5, verbose=0, scoring='neg_mean_squared_error')
clf4.fit(X_train, y_train)
y_pred = clf4.predict(X_test)
print("RFR best parameters:", clf4.best_params_)
print("RFR score:", clf4.score(X_test, y_test))
print("RFR MSE:", mean_squared_error(y_test, clf4.predict(X_test)))
sns.regplot(x=y_test, y=y_pred)
# ============================================================================
# Gross/budget-ratio regression — US-only dataset, WITHOUT the rank features.
# Same pipeline as the previous experiments.
# (Bare notebook display expressions removed: they are no-ops in a script.)
# ============================================================================
movie_df = pd.read_csv(r'data/data_regression_onlyUS.csv')
classification_df = movie_df.drop(columns=['imdb_id', 'director_name', 'num_critic_for_reviews',
                                           'actor_2_name',
                                           'gross', 'genres', 'actor_1_name',
                                           'movie_title', 'num_voted_users',
                                           'actor_3_name', 'plot_keywords',
                                           'num_user_for_reviews',
                                           'title_year', 'imdb_score',
                                           'movie_facebook_likes', 'director_rank', 'actor1_rank', 'actor2_rank', 'actor3_rank'])
classification1_df = classification_df.drop(columns=['language'])
df1 = pd.get_dummies(classification1_df, columns=['content_rating'])
df1 = df1.drop(columns=['content_rating_Not Rated', 'content_rating_G',
                        'content_rating_X', 'content_rating_NC-17'])
# Remove target outliers beyond the Tukey 1.5 * IQR fences.
Q1 = df1['gross_budget_ratio'].quantile(0.25)
Q3 = df1['gross_budget_ratio'].quantile(0.75)
IQR = Q3 - Q1  # IQR is interquartile range.
filtering = (df1['gross_budget_ratio'] >= Q1 - 1.5 * IQR) & (df1['gross_budget_ratio'] <= Q3 + 1.5 * IQR)
df1 = df1.loc[filtering]
X = df1.drop(columns=['gross_budget_ratio'])
y = df1['gross_budget_ratio']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
# --- Ridge regression ---
parameters = {'alpha': np.concatenate((np.arange(0.1, 2, 0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
ridge = Ridge()
clf1 = GridSearchCV(ridge, parameters, scoring='neg_mean_squared_error')
clf1.fit(X_train_scaled, y_train)
y_pred = clf1.predict(X_test_scaled)
print("ridge best parameters:", clf1.best_params_)
print("ridge score:", clf1.score(X_test_scaled, y_test))
print("ridge MSE:", mean_squared_error(y_test, clf1.predict(X_test_scaled)))
print("ridge best estimator coef:", clf1.best_estimator_.coef_)
# FIX: seaborn removed positional x/y arguments; pass them as keywords.
sns.regplot(x=y_test, y=y_pred)
# --- Decision tree regressor ---
DTSplitList = [0.001, 0.01, 0.05, 0.1, 0.5, 0.99]
DTLeafList = [0.0005, 0.005, 0.025, 0.05, 0.1, 0.25, 0.5]
criterion = ['mse']  # NOTE(review): 'mse' was renamed 'squared_error' in sklearn >= 1.0
parameters = {'criterion': criterion, 'max_depth': list(range(100, 0, -3)),
              'min_samples_leaf': DTLeafList, 'min_samples_split': DTSplitList}
DTR = DecisionTreeRegressor(random_state=10)
clf3 = GridSearchCV(DTR, parameters, scoring='neg_mean_squared_error')
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("DTR best parameters:", clf3.best_params_)
print("DTR score:", clf3.score(X_test, y_test))
print("DTR MSE:", mean_squared_error(y_test, clf3.predict(X_test)))
sns.regplot(x=y_test, y=y_pred)
# --- Random forest regressor ---
RFEstimatorList = [25, 50, 100, 200]
criterion = ['mse']
parameters = {'criterion': criterion, 'max_depth': list(range(100, 0, -10)), 'n_estimators': RFEstimatorList}
RFR = RandomForestRegressor(random_state=10)
clf4 = GridSearchCV(RFR, parameters, cv=5, verbose=0, scoring='neg_mean_squared_error')
clf4.fit(X_train, y_train)
y_pred = clf4.predict(X_test)
print("RFR best parameters:", clf4.best_params_)
print("RFR score:", clf4.score(X_test, y_test))
print("RFR MSE:", mean_squared_error(y_test, clf4.predict(X_test)))
sns.regplot(x=y_test, y=y_pred)
# --- Variant 2: log-transform skewed features (floor tiny values to avoid -inf) ---
df2 = df1.copy()
df2['budget'] = np.log(df2['budget'])
df2['director_facebook_likes'] = df2['director_facebook_likes'].transform(lambda x: np.log(0.2) if x <= 0.2 else np.log(x))
df2['actor_1_facebook_likes'] = df2['actor_1_facebook_likes'].transform(lambda x: np.log(0.2) if x <= 0.2 else np.log(x))
df2['actor_2_facebook_likes'] = df2['actor_2_facebook_likes'].transform(lambda x: np.log(0.2) if x <= 0.2 else np.log(x))
df2['actor_3_facebook_likes'] = df2['actor_3_facebook_likes'].transform(lambda x: np.log(0.2) if x <= 0.2 else np.log(x))
df2['cast_total_facebook_likes'] = df2['cast_total_facebook_likes'].transform(lambda x: np.log(0.2) if x <= 0.2 else np.log(x))
df2['gross_budget_ratio'] = df2['gross_budget_ratio'].transform(lambda x: np.log(0.1) if x <= 0.1 else np.log(x))
df2 = df2.drop(columns=['facenumber_in_poster', 'Biography', 'Crime',
                        'Mystery_Thriller_Horror', 'Sci-Fi_Fantasy',
                        'History_War'])
X = df2.drop(columns=['gross_budget_ratio'])
y = df2['gross_budget_ratio']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
# Ridge on the log-transformed data.
parameters = {'alpha': np.concatenate((np.arange(0.1, 2, 0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
ridge = Ridge()
clf1 = GridSearchCV(ridge, parameters, scoring='neg_mean_squared_error')
clf1.fit(X_train_scaled, y_train)
y_pred = clf1.predict(X_test_scaled)
print("ridge best parameters:", clf1.best_params_)
print("ridge score:", clf1.score(X_test_scaled, y_test))
print("ridge MSE:", mean_squared_error(y_test, clf1.predict(X_test_scaled)))
print("ridge best estimator coef:", clf1.best_estimator_.coef_)
sns.regplot(x=y_test, y=y_pred)
# Decision tree on the log-transformed data.
DTSplitList = [0.001, 0.01, 0.05, 0.1, 0.5, 0.99]
DTLeafList = [0.0005, 0.005, 0.025, 0.05, 0.1, 0.25, 0.5]
criterion = ['mse']
parameters = {'criterion': criterion, 'max_depth': list(range(100, 0, -3)),
              'min_samples_leaf': DTLeafList, 'min_samples_split': DTSplitList}
DTR = DecisionTreeRegressor(random_state=10)
clf3 = GridSearchCV(DTR, parameters, scoring='neg_mean_squared_error')
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("DTR best parameters:", clf3.best_params_)
print("DTR score:", clf3.score(X_test, y_test))
print("DTR MSE:", mean_squared_error(y_test, clf3.predict(X_test)))
sns.regplot(x=y_test, y=y_pred)
# Random forest on the log-transformed data.
RFEstimatorList = [25, 50, 100, 200]
criterion = ['mse']
parameters = {'criterion': criterion, 'max_depth': list(range(100, 0, -10)), 'n_estimators': RFEstimatorList}
RFR = RandomForestRegressor(random_state=10)
clf4 = GridSearchCV(RFR, parameters, cv=5, verbose=0, scoring='neg_mean_squared_error')
clf4.fit(X_train, y_train)
y_pred = clf4.predict(X_test)
print("RFR best parameters:", clf4.best_params_)
print("RFR score:", clf4.score(X_test, y_test))
print("RFR MSE:", mean_squared_error(y_test, clf4.predict(X_test)))
sns.regplot(x=y_test, y=y_pred)
# ============================================================================
# Gross/budget-ratio regression — full dataset, WITHOUT the individual
# director/actor facebook-like features (keeps the ranks instead).
# (Bare notebook display expressions removed: they are no-ops in a script.)
# ============================================================================
movie_df = pd.read_csv(r'data/data_regression.csv')
classification_df = movie_df.drop(columns=['imdb_id', 'director_name', 'num_critic_for_reviews',
                                           'actor_2_name',
                                           'gross', 'genres', 'actor_1_name',
                                           'movie_title', 'num_voted_users',
                                           'actor_3_name', 'plot_keywords',
                                           'num_user_for_reviews',
                                           'title_year', 'imdb_score',
                                           'movie_facebook_likes', 'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_facebook_likes', 'actor_1_facebook_likes'])
classification1_df = classification_df.drop(columns=['country', 'language'])
df1 = pd.get_dummies(classification1_df, columns=['content_rating'])
df1 = df1.drop(columns=['content_rating_Not Rated', 'content_rating_G',
                        'content_rating_X', 'content_rating_NC-17'])
# Remove target outliers beyond the Tukey 1.5 * IQR fences.
Q1 = df1['gross_budget_ratio'].quantile(0.25)
Q3 = df1['gross_budget_ratio'].quantile(0.75)
IQR = Q3 - Q1  # IQR is interquartile range.
filtering = (df1['gross_budget_ratio'] >= Q1 - 1.5 * IQR) & (df1['gross_budget_ratio'] <= Q3 + 1.5 * IQR)
df1 = df1.loc[filtering]
X = df1.drop(columns=['gross_budget_ratio'])
y = df1['gross_budget_ratio']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
# --- Ridge regression ---
parameters = {'alpha': np.concatenate((np.arange(0.1, 2, 0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
ridge = Ridge()
clf1 = GridSearchCV(ridge, parameters, scoring='neg_mean_squared_error')
clf1.fit(X_train_scaled, y_train)
y_pred = clf1.predict(X_test_scaled)
print("ridge best parameters:", clf1.best_params_)
print("ridge score:", clf1.score(X_test_scaled, y_test))
print("ridge MSE:", mean_squared_error(y_test, clf1.predict(X_test_scaled)))
print("ridge best estimator coef:", clf1.best_estimator_.coef_)
# FIX: seaborn removed positional x/y arguments; pass them as keywords.
sns.regplot(x=y_test, y=y_pred)
# --- Decision tree regressor ---
DTSplitList = [0.001, 0.01, 0.05, 0.1, 0.5, 0.99]
DTLeafList = [0.0005, 0.005, 0.025, 0.05, 0.1, 0.25, 0.5]
criterion = ['mse']  # NOTE(review): 'mse' was renamed 'squared_error' in sklearn >= 1.0
parameters = {'criterion': criterion, 'max_depth': list(range(100, 0, -3)),
              'min_samples_leaf': DTLeafList, 'min_samples_split': DTSplitList}
DTR = DecisionTreeRegressor(random_state=10)
clf3 = GridSearchCV(DTR, parameters, scoring='neg_mean_squared_error')
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("DTR best parameters:", clf3.best_params_)
print("DTR score:", clf3.score(X_test, y_test))
print("DTR MSE:", mean_squared_error(y_test, clf3.predict(X_test)))
sns.regplot(x=y_test, y=y_pred)
# --- Random forest regressor ---
RFEstimatorList = [25, 50, 100, 200]
criterion = ['mse']
parameters = {'criterion': criterion, 'max_depth': list(range(100, 0, -10)), 'n_estimators': RFEstimatorList}
RFR = RandomForestRegressor(random_state=10)
clf4 = GridSearchCV(RFR, parameters, cv=5, verbose=0, scoring='neg_mean_squared_error')
clf4.fit(X_train, y_train)
y_pred = clf4.predict(X_test)
print("RFR best parameters:", clf4.best_params_)
print("RFR score:", clf4.score(X_test, y_test))
print("RFR MSE:", mean_squared_error(y_test, clf4.predict(X_test)))
sns.regplot(x=y_test, y=y_pred)
# --- Variant 2: log-transform remaining skewed features ---
df2 = df1.copy()
df2['budget'] = np.log(df2['budget'])
df2['cast_total_facebook_likes'] = df2['cast_total_facebook_likes'].transform(lambda x: np.log(0.2) if x <= 0.2 else np.log(x))
df2['gross_budget_ratio'] = df2['gross_budget_ratio'].transform(lambda x: np.log(0.1) if x <= 0.1 else np.log(x))
df2 = df2.drop(columns=['facenumber_in_poster', 'Biography', 'Crime',
                        'Mystery_Thriller_Horror', 'Sci-Fi_Fantasy',
                        'History_War'])
X = df2.drop(columns=['gross_budget_ratio'])
y = df2['gross_budget_ratio']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
# Ridge on the log-transformed data.
parameters = {'alpha': np.concatenate((np.arange(0.1, 2, 0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
ridge = Ridge()
clf1 = GridSearchCV(ridge, parameters, scoring='neg_mean_squared_error')
clf1.fit(X_train_scaled, y_train)
y_pred = clf1.predict(X_test_scaled)
print("ridge best parameters:", clf1.best_params_)
print("ridge score:", clf1.score(X_test_scaled, y_test))
print("ridge MSE:", mean_squared_error(y_test, clf1.predict(X_test_scaled)))
print("ridge best estimator coef:", clf1.best_estimator_.coef_)
sns.regplot(x=y_test, y=y_pred)
# Decision tree on the log-transformed data.
DTSplitList = [0.001, 0.01, 0.05, 0.1, 0.5, 0.99]
DTLeafList = [0.0005, 0.005, 0.025, 0.05, 0.1, 0.25, 0.5]
criterion = ['mse']
parameters = {'criterion': criterion, 'max_depth': list(range(100, 0, -3)),
              'min_samples_leaf': DTLeafList, 'min_samples_split': DTSplitList}
DTR = DecisionTreeRegressor(random_state=10)
clf3 = GridSearchCV(DTR, parameters, scoring='neg_mean_squared_error')
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("DTR best parameters:", clf3.best_params_)
print("DTR score:", clf3.score(X_test, y_test))
print("DTR MSE:", mean_squared_error(y_test, clf3.predict(X_test)))
sns.regplot(x=y_test, y=y_pred)
# Random forest on the log-transformed data.
RFEstimatorList = [25, 50, 100, 200]
criterion = ['mse']
parameters = {'criterion': criterion, 'max_depth': list(range(100, 0, -10)), 'n_estimators': RFEstimatorList}
RFR = RandomForestRegressor(random_state=10)
clf4 = GridSearchCV(RFR, parameters, cv=5, verbose=0, scoring='neg_mean_squared_error')
clf4.fit(X_train, y_train)
y_pred = clf4.predict(X_test)
print("RFR best parameters:", clf4.best_params_)
print("RFR score:", clf4.score(X_test, y_test))
print("RFR MSE:", mean_squared_error(y_test, clf4.predict(X_test)))
sns.regplot(x=y_test, y=y_pred)
# ============================================================================
# Gross/budget-ratio regression — US-only dataset, WITHOUT the individual
# director/actor facebook-like features (keeps the ranks instead).
# (Bare notebook display expressions removed: they are no-ops in a script.)
# ============================================================================
movie_df = pd.read_csv(r'data/data_regression_onlyUS.csv')
classification_df = movie_df.drop(columns=['imdb_id', 'director_name', 'num_critic_for_reviews',
                                           'actor_2_name',
                                           'gross', 'genres', 'actor_1_name',
                                           'movie_title', 'num_voted_users',
                                           'actor_3_name', 'plot_keywords',
                                           'num_user_for_reviews',
                                           'title_year', 'imdb_score',
                                           'movie_facebook_likes', 'director_facebook_likes', 'actor_3_facebook_likes', 'actor_2_facebook_likes', 'actor_1_facebook_likes'])
classification1_df = classification_df.drop(columns=['language'])
df1 = pd.get_dummies(classification1_df, columns=['content_rating'])
df1 = df1.drop(columns=['content_rating_Not Rated', 'content_rating_G',
                        'content_rating_X', 'content_rating_NC-17'])
# Remove target outliers beyond the Tukey 1.5 * IQR fences.
Q1 = df1['gross_budget_ratio'].quantile(0.25)
Q3 = df1['gross_budget_ratio'].quantile(0.75)
IQR = Q3 - Q1  # IQR is interquartile range.
filtering = (df1['gross_budget_ratio'] >= Q1 - 1.5 * IQR) & (df1['gross_budget_ratio'] <= Q3 + 1.5 * IQR)
df1 = df1.loc[filtering]
X = df1.drop(columns=['gross_budget_ratio'])
y = df1['gross_budget_ratio']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
# --- Ridge regression ---
parameters = {'alpha': np.concatenate((np.arange(0.1, 2, 0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
ridge = Ridge()
clf1 = GridSearchCV(ridge, parameters, scoring='neg_mean_squared_error')
clf1.fit(X_train_scaled, y_train)
y_pred = clf1.predict(X_test_scaled)
print("ridge best parameters:", clf1.best_params_)
print("ridge score:", clf1.score(X_test_scaled, y_test))
print("ridge MSE:", mean_squared_error(y_test, clf1.predict(X_test_scaled)))
print("ridge best estimator coef:", clf1.best_estimator_.coef_)
# FIX: seaborn removed positional x/y arguments; pass them as keywords.
sns.regplot(x=y_test, y=y_pred)
# --- Decision tree regressor ---
DTSplitList = [0.001, 0.01, 0.05, 0.1, 0.5, 0.99]
DTLeafList = [0.0005, 0.005, 0.025, 0.05, 0.1, 0.25, 0.5]
criterion = ['mse']  # NOTE(review): 'mse' was renamed 'squared_error' in sklearn >= 1.0
parameters = {'criterion': criterion, 'max_depth': list(range(100, 0, -3)),
              'min_samples_leaf': DTLeafList, 'min_samples_split': DTSplitList}
DTR = DecisionTreeRegressor(random_state=10)
clf3 = GridSearchCV(DTR, parameters, scoring='neg_mean_squared_error')
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("DTR best parameters:", clf3.best_params_)
print("DTR score:", clf3.score(X_test, y_test))
print("DTR MSE:", mean_squared_error(y_test, clf3.predict(X_test)))
sns.regplot(x=y_test, y=y_pred)
# --- Random forest regressor ---
RFEstimatorList = [25, 50, 100, 200]
criterion = ['mse']
parameters = {'criterion': criterion, 'max_depth': list(range(100, 0, -10)), 'n_estimators': RFEstimatorList}
RFR = RandomForestRegressor(random_state=10)
clf4 = GridSearchCV(RFR, parameters, cv=5, verbose=0, scoring='neg_mean_squared_error')
clf4.fit(X_train, y_train)
y_pred = clf4.predict(X_test)
print("RFR best parameters:", clf4.best_params_)
print("RFR score:", clf4.score(X_test, y_test))
print("RFR MSE:", mean_squared_error(y_test, clf4.predict(X_test)))
sns.regplot(x=y_test, y=y_pred)
# --- Variant 2: log-transform remaining skewed features ---
df2 = df1.copy()
df2['budget'] = np.log(df2['budget'])
df2['cast_total_facebook_likes'] = df2['cast_total_facebook_likes'].transform(lambda x: np.log(0.2) if x <= 0.2 else np.log(x))
df2['gross_budget_ratio'] = df2['gross_budget_ratio'].transform(lambda x: np.log(0.1) if x <= 0.1 else np.log(x))
df2 = df2.drop(columns=['facenumber_in_poster', 'Biography', 'Crime',
                        'Mystery_Thriller_Horror', 'Sci-Fi_Fantasy',
                        'History_War'])
X = df2.drop(columns=['gross_budget_ratio'])
y = df2['gross_budget_ratio']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=10)
print(f"Dimensionality of X_train: {X_train.shape}")
print(f"Dimensionality of X_test : {X_test.shape}")
print(f"Dimensionality of y_train: {y_train.shape}")
print(f"Dimensionality of y_test : {y_test.shape}")
MMScaler = MinMaxScaler()
X_train_scaled = MMScaler.fit_transform(X_train)
X_test_scaled = MMScaler.transform(X_test)
# Ridge on the log-transformed data.
parameters = {'alpha': np.concatenate((np.arange(0.1, 2, 0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
ridge = Ridge()
clf1 = GridSearchCV(ridge, parameters, scoring='neg_mean_squared_error')
clf1.fit(X_train_scaled, y_train)
y_pred = clf1.predict(X_test_scaled)
print("ridge best parameters:", clf1.best_params_)
print("ridge score:", clf1.score(X_test_scaled, y_test))
print("ridge MSE:", mean_squared_error(y_test, clf1.predict(X_test_scaled)))
print("ridge best estimator coef:", clf1.best_estimator_.coef_)
sns.regplot(x=y_test, y=y_pred)
# Decision tree on the log-transformed data.
DTSplitList = [0.001, 0.01, 0.05, 0.1, 0.5, 0.99]
DTLeafList = [0.0005, 0.005, 0.025, 0.05, 0.1, 0.25, 0.5]
criterion = ['mse']
parameters = {'criterion': criterion, 'max_depth': list(range(100, 0, -3)),
              'min_samples_leaf': DTLeafList, 'min_samples_split': DTSplitList}
DTR = DecisionTreeRegressor(random_state=10)
clf3 = GridSearchCV(DTR, parameters, scoring='neg_mean_squared_error')
clf3.fit(X_train, y_train)
y_pred = clf3.predict(X_test)
print("DTR best parameters:", clf3.best_params_)
print("DTR score:", clf3.score(X_test, y_test))
print("DTR MSE:", mean_squared_error(y_test, clf3.predict(X_test)))
sns.regplot(x=y_test, y=y_pred)
# Random forest on the log-transformed data.
RFEstimatorList = [25, 50, 100, 200]
criterion = ['mse']
parameters = {'criterion': criterion, 'max_depth': list(range(100, 0, -10)), 'n_estimators': RFEstimatorList}
RFR = RandomForestRegressor(random_state=10)
clf4 = GridSearchCV(RFR, parameters, cv=5, verbose=0, scoring='neg_mean_squared_error')
clf4.fit(X_train, y_train)
y_pred = clf4.predict(X_test)
print("RFR best parameters:", clf4.best_params_)
print("RFR score:", clf4.score(X_test, y_test))
print("RFR MSE:", mean_squared_error(y_test, clf4.predict(X_test)))
sns.regplot(x=y_test, y=y_pred)
# Load the full regression dataset for the IMDb-score prediction experiments.
movie_df = pd.read_csv(r'data/data_regression.csv')
print(movie_df.shape)
movie_df.dtypes  # notebook-style inspection; has no effect when run as a script
movie_df.describe()  # notebook-style inspection; has no effect when run as a script
# Here we use variables: duration, budget, genres, director rank, actor1 rank, actor2 rank, actor3 rank, movie facebook likes, cast total facebook likes, facenumber in poster.
# ============================================================================
# Model 1: predict imdb_score from duration, budget, genre dummies, ranks,
# facebook likes and facenumber_in_poster. Linear / Ridge / Lasso / ElasticNet
# on min-max-scaled features; tree models on the raw features.
# (Bare notebook display expressions removed: they are no-ops in a script.)
# ============================================================================
data1 = movie_df[['duration', 'budget', 'Biography', 'Comedy', 'Crime', 'Drama', 'Romance',
                  'Mystery_Thriller_Horror', 'Sci-Fi_Fantasy', 'Family_Animation', 'Action_Adventure',
                  'History_War', 'Others', 'director_rank',
                  'actor1_rank', 'actor2_rank', 'actor3_rank', 'movie_facebook_likes',
                  'cast_total_facebook_likes', 'facenumber_in_poster']]
y = movie_df['imdb_score']
# NOTE(review): no random_state here, so this split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(data1, y, test_size=0.30)
MM = MinMaxScaler(feature_range=(0, 1), copy=True)
X_train_MM = MM.fit_transform(X_train)  # scale X_train to [0, 1]
X_test_MM = MM.transform(X_test)        # apply the same scaling to X_test
X_train_scaled = pd.DataFrame(data=X_train_MM, columns=X_train.columns)
X_test_scaled = pd.DataFrame(data=X_test_MM, columns=X_test.columns)
# --- Plain linear regression baseline ---
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
y_pred = lr.predict(X_test_scaled)
print("linear regression R2:", r2_score(y_test, y_pred))
print("linear regression MSE:", mean_squared_error(y_test, y_pred, multioutput='raw_values'))
# FIX: seaborn removed positional x/y arguments; pass them as keywords.
sns.regplot(x=y_test, y=y_pred)
# --- Ridge ---
parameters = {'alpha': np.concatenate((np.arange(0.1, 2, 0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
ridge = linear_model.Ridge()
gridridge = GridSearchCV(ridge, parameters, scoring='r2')
gridridge.fit(X_train_scaled, y_train)
y_pred = gridridge.predict(X_test_scaled)
print("ridge best parameters:", gridridge.best_params_)
print("ridge score:", gridridge.score(X_test_scaled, y_test))
print("ridge MSE:", mean_squared_error(y_test, gridridge.predict(X_test_scaled)))
print("ridge best estimator coef:", gridridge.best_estimator_.coef_)
sns.regplot(x=y_test, y=y_pred)
# --- Lasso ---
parameters = {'alpha': np.concatenate((np.arange(0.01, 2, 0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
lasso = linear_model.Lasso()
gridlasso = GridSearchCV(lasso, parameters, scoring='r2')
gridlasso.fit(X_train_scaled, y_train)
y_pred = gridlasso.predict(X_test_scaled)
print("lasso best parameters:", gridlasso.best_params_)
print("lasso score:", gridlasso.score(X_test_scaled, y_test))
print("lasso MSE:", mean_squared_error(y_test, gridlasso.predict(X_test_scaled)))
print("lasso best estimator coef:", gridlasso.best_estimator_.coef_)
sns.regplot(x=y_test, y=y_pred)
# --- Elastic net ---
parameters = {'alpha': np.concatenate((np.arange(0.01, 2, 0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
en = linear_model.ElasticNet()
griden = GridSearchCV(en, parameters, scoring='r2')
griden.fit(X_train_scaled, y_train)
y_pred = griden.predict(X_test_scaled)
print("Elastic Net best parameters:", griden.best_params_)
print("Elastic Net score:", griden.score(X_test_scaled, y_test))
print("Elastic Net MSE:", mean_squared_error(y_test, griden.predict(X_test_scaled)))
print("Elastic Net best estimator coef:", griden.best_estimator_.coef_)
sns.regplot(x=y_test, y=y_pred)
# --- Decision tree regressor (raw, unscaled features) ---
DTSplitList = [0.001, 0.01, 0.05, 0.1, 0.5, 0.99]
DTLeafList = [0.0005, 0.005, 0.025, 0.05, 0.1, 0.25, 0.5]
criterion = ['mse']  # NOTE(review): 'mse' was renamed 'squared_error' in sklearn >= 1.0
parameters = {'criterion': criterion, 'max_depth': list(range(100, 0, -3)),
              'min_samples_leaf': DTLeafList, 'min_samples_split': DTSplitList}
DTR = DecisionTreeRegressor(random_state=10)
gridrf = GridSearchCV(DTR, parameters, scoring='r2')
gridrf.fit(X_train, y_train)
y_pred = gridrf.predict(X_test)
print("DTR best parameters:", gridrf.best_params_)
print("DTR score:", gridrf.score(X_test, y_test))
print("DTR MSE:", mean_squared_error(y_test, gridrf.predict(X_test)))
sns.regplot(x=y_test, y=y_pred)
# --- Random forest regressor ---
RFEstimatorList = [25, 50, 100, 200]
criterion = ['mse']
parameters = {'criterion': criterion, 'max_depth': list(range(100, 0, -10)), 'n_estimators': RFEstimatorList}
RFR = RandomForestRegressor(random_state=10)
gridrf = GridSearchCV(RFR, parameters, cv=5, verbose=0, scoring='r2')
gridrf.fit(X_train, y_train)
y_pred = gridrf.predict(X_test)
print("RFR best parameters:", gridrf.best_params_)
print("RFR score:", gridrf.score(X_test, y_test))
print("RFR MSE:", mean_squared_error(y_test, gridrf.predict(X_test)))
sns.regplot(x=y_test, y=y_pred)
Here we exclude the genre features from Model 1.
# Model 2: repeat the pipeline without the genre indicator columns.
data2 = movie_df[['duration','budget','director_rank',
'actor1_rank','actor2_rank','actor3_rank'
,'movie_facebook_likes','cast_total_facebook_likes','facenumber_in_poster']]
y = movie_df['imdb_score']
X_train, X_test, y_train, y_test = train_test_split(data2, y, test_size=0.30)
MM = MinMaxScaler(feature_range=(0, 1), copy=True)
X_train_MM = MM.fit_transform(X_train) # min-max scale X_train (scaler fitted on train only)
X_test_MM = MM.transform(X_test) # apply the train-fitted scaling to X_test
X_train_scaled = pd.DataFrame(data=X_train_MM, columns=X_train.columns)
X_test_scaled = pd.DataFrame(data=X_test_MM, columns=X_test.columns)
# --- plain linear regression baseline ---
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
y_pred= lr.predict(X_test_scaled)
print("linear regression MSE:", mean_squared_error(y_test, y_pred, multioutput='raw_values'))
print("linear regression R2:", r2_score(y_test, y_pred))
sns.regplot(y_test, y_pred)
# --- Ridge: grid-search alpha with R^2 scoring ---
parameters = {'alpha': np.concatenate((np.arange(0.1,2,0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
ridge = linear_model.Ridge()
gridridge = GridSearchCV(ridge, parameters, scoring ='r2')
gridridge.fit(X_train_scaled, y_train)
y_pred = gridridge.predict(X_test_scaled)
print("ridge best parameters:", gridridge.best_params_)
print("ridge score:", gridridge.score(X_test_scaled, y_test))
print("ridge MSE:", mean_squared_error(y_test, gridridge.predict(X_test_scaled)))
print("ridge best estimator coef:", gridridge.best_estimator_.coef_)
sns.regplot(y_test, y_pred)
# --- Lasso: same grid with a finer low end for alpha ---
parameters = {'alpha': np.concatenate((np.arange(0.01,2,0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
lasso = linear_model.Lasso()
gridlasso = GridSearchCV(lasso, parameters, scoring ='r2')
gridlasso.fit(X_train_scaled, y_train)
y_pred = gridlasso.predict(X_test_scaled)
print("lasso best parameters:", gridlasso.best_params_)
print("lasso score:", gridlasso.score(X_test_scaled, y_test))
print("lasso MSE:", mean_squared_error(y_test, gridlasso.predict(X_test_scaled)))
print("lasso best estimator coef:", gridlasso.best_estimator_.coef_)
sns.regplot(y_test, y_pred)
# Elastic Net: grid-search alpha with R^2 scoring.
# Fix: this cell was copy-pasted from the Lasso cell above — the fitted
# search was stored in `gridlasso` (clobbering the Lasso results) and every
# printout was labelled "lasso". Renamed to `griden` / "Elastic Net" to match
# the Elastic Net cell of the first model.
parameters = {'alpha': np.concatenate((np.arange(0.01,2,0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
en = linear_model.ElasticNet()
griden = GridSearchCV(en, parameters, scoring ='r2')
griden.fit(X_train_scaled, y_train)
y_pred = griden.predict(X_test_scaled)
print("Elastic Net best parameters:", griden.best_params_)
print("Elastic Net score:", griden.score(X_test_scaled, y_test))
print("Elastic Net MSE:", mean_squared_error(y_test, griden.predict(X_test_scaled)))
print("Elastic Net best estimator coef:", griden.best_estimator_.coef_)
sns.regplot(y_test, y_pred)
# Decision tree and random forest for Model 2; tree models use the unscaled
# features, tuned with GridSearchCV on R^2.
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
criterion =['mse']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTR = DecisionTreeRegressor(random_state=10)
gridrf = GridSearchCV(DTR, parameters, scoring ='r2')
gridrf.fit(X_train, y_train)
y_pred = gridrf.predict(X_test)
print("DTR best parameters:", gridrf.best_params_)
print("DTR score:", gridrf.score(X_test, y_test))
print("DTR MSE:", mean_squared_error(y_test, gridrf.predict(X_test)))
sns.regplot(y_test, y_pred )
# --- random forest with 5-fold CV over depth and forest size ---
RFEstimatorList = [25,50,100,200]
criterion =['mse']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFR = RandomForestRegressor(random_state=10)
gridrf = GridSearchCV(RFR, parameters, cv=5, verbose=0,scoring ='r2')
gridrf.fit(X_train, y_train)
y_pred = gridrf.predict(X_test)
print("RFR best parameters:", gridrf.best_params_)
print("RFR score:", gridrf.score(X_test, y_test))
print("RFR MSE:", mean_squared_error(y_test, gridrf.predict(X_test)))
sns.regplot(y_test, y_pred)
Here we will transform the budget values to log values.
# Model 3: genre features back in, raw budget replaced by log(budget).
data3 = movie_df[['duration','Biography','Comedy','Crime','Drama','Romance',
'Mystery_Thriller_Horror','Sci-Fi_Fantasy','Family_Animation','Action_Adventure',
'History_War','Others','director_rank',
'actor1_rank','actor2_rank','actor3_rank','movie_facebook_likes','cast_total_facebook_likes'
,'facenumber_in_poster']]
budget = np.log(movie_df['budget'])
data3['log_budget'] = budget
y = movie_df['imdb_score']
X_train, X_test, y_train, y_test = train_test_split(data3, y, test_size=0.30)
MM = MinMaxScaler(feature_range=(0, 1), copy=True)
X_train_MM = MM.fit_transform(X_train) # min-max scale X_train (scaler fitted on train only)
X_test_MM = MM.transform(X_test) # apply the train-fitted scaling to X_test
X_train_scaled = pd.DataFrame(data=X_train_MM, columns=X_train.columns)
X_test_scaled = pd.DataFrame(data=X_test_MM, columns=X_test.columns)
# --- plain linear regression baseline ---
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
y_pred= lr.predict(X_test_scaled)
print("linear regression MSE:", mean_squared_error(y_test, y_pred, multioutput='raw_values'))
print("linear regression R2:", r2_score(y_test, y_pred))
sns.regplot(y_test, y_pred)
# --- Ridge: grid-search alpha with R^2 scoring ---
parameters = {'alpha': np.concatenate((np.arange(0.1,2,0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
ridge = linear_model.Ridge()
gridridge = GridSearchCV(ridge, parameters, scoring ='r2')
gridridge.fit(X_train_scaled, y_train)
y_pred = gridridge.predict(X_test_scaled)
print("ridge best parameters:", gridridge.best_params_)
print("ridge score:", gridridge.score(X_test_scaled, y_test))
print("ridge MSE:", mean_squared_error(y_test, gridridge.predict(X_test_scaled)))
print("ridge best estimator coef:", gridridge.best_estimator_.coef_)
sns.regplot(y_test, y_pred)
# --- Lasso: same grid with a finer low end for alpha ---
parameters = {'alpha': np.concatenate((np.arange(0.01,2,0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
lasso = linear_model.Lasso()
gridlasso = GridSearchCV(lasso, parameters, scoring ='r2')
gridlasso.fit(X_train_scaled, y_train)
y_pred = gridlasso.predict(X_test_scaled)
print("lasso best parameters:", gridlasso.best_params_)
print("lasso score:", gridlasso.score(X_test_scaled, y_test))
print("lasso MSE:", mean_squared_error(y_test, gridlasso.predict(X_test_scaled)))
print("lasso best estimator coef:", gridlasso.best_estimator_.coef_)
sns.regplot(y_test, y_pred)
# Elastic Net: grid-search alpha with R^2 scoring.
# Fix: this cell was copy-pasted from the Lasso cell above — the fitted
# search was stored in `gridlasso` (clobbering the Lasso results) and every
# printout was labelled "lasso". Renamed to `griden` / "Elastic Net" to match
# the Elastic Net cell of the first model.
parameters = {'alpha': np.concatenate((np.arange(0.01,2,0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
en = linear_model.ElasticNet()
griden = GridSearchCV(en, parameters, scoring ='r2')
griden.fit(X_train_scaled, y_train)
y_pred = griden.predict(X_test_scaled)
print("Elastic Net best parameters:", griden.best_params_)
print("Elastic Net score:", griden.score(X_test_scaled, y_test))
print("Elastic Net MSE:", mean_squared_error(y_test, griden.predict(X_test_scaled)))
print("Elastic Net best estimator coef:", griden.best_estimator_.coef_)
sns.regplot(y_test, y_pred)
# Decision tree and random forest for Model 3 (unscaled features), then two
# exploratory regression plots against movie Facebook likes.
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
criterion =['mse']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTR = DecisionTreeRegressor(random_state=10)
gridrf = GridSearchCV(DTR, parameters, scoring ='r2')
gridrf.fit(X_train, y_train)
y_pred = gridrf.predict(X_test)
print("DTR best parameters:", gridrf.best_params_)
print("DTR score:", gridrf.score(X_test, y_test))
print("DTR MSE:", mean_squared_error(y_test, gridrf.predict(X_test)))
sns.regplot(y_test, y_pred )
# --- random forest with 5-fold CV ---
RFEstimatorList = [25,50,100,200]
criterion =['mse']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFR = RandomForestRegressor(random_state=10)
gridrf = GridSearchCV(RFR, parameters, cv=5, verbose=0,scoring ='r2')
gridrf.fit(X_train, y_train)
y_pred = gridrf.predict(X_test)
print("RFR best parameters:", gridrf.best_params_)
print("RFR score:", gridrf.score(X_test, y_test))
print("RFR MSE:", mean_squared_error(y_test, gridrf.predict(X_test)))
sns.regplot(y_test, y_pred)
# exploratory: likes vs score, and release year vs likes
sns.regplot(movie_df['movie_facebook_likes'], movie_df['imdb_score'])
sns.regplot(movie_df['title_year'],movie_df['movie_facebook_likes'])
Here we include only movies with nonzero movie Facebook likes and nonzero cast Facebook likes.
# Model 4: restrict the data to rows with nonzero movie and cast Facebook
# likes, then repeat the regression pipeline.
new = movie_df.query("movie_facebook_likes != 0")
y = new['imdb_score']  # NOTE: dead assignment — recomputed below after the second filter
new = new.query("cast_total_facebook_likes != 0")
data4 = new[['duration','budget','Biography','Comedy','Crime','Drama','Romance',
'Mystery_Thriller_Horror','Sci-Fi_Fantasy','Family_Animation','Action_Adventure',
'History_War','Others','director_rank',
'actor1_rank','actor2_rank','actor3_rank'
,'movie_facebook_likes','cast_total_facebook_likes','facenumber_in_poster']]
y = new['imdb_score']
X_train, X_test, y_train, y_test = train_test_split(data4, y, test_size=0.30)
MM = MinMaxScaler(feature_range=(0, 1), copy=True)
X_train_MM = MM.fit_transform(X_train) # min-max scale X_train (scaler fitted on train only)
X_test_MM = MM.transform(X_test) # apply the train-fitted scaling to X_test
X_train_scaled = pd.DataFrame(data=X_train_MM, columns=X_train.columns)
X_test_scaled = pd.DataFrame(data=X_test_MM, columns=X_test.columns)
# --- plain linear regression baseline ---
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
y_pred= lr.predict(X_test_scaled)
print("linear regression MSE:", mean_squared_error(y_test, y_pred, multioutput='raw_values'))
print("linear regression R2:", r2_score(y_test, y_pred))
sns.regplot(y_test, y_pred)
# --- Ridge: grid-search alpha with R^2 scoring ---
parameters = {'alpha': np.concatenate((np.arange(0.1,2,0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
ridge = linear_model.Ridge()
gridridge = GridSearchCV(ridge, parameters, scoring ='r2')
gridridge.fit(X_train_scaled, y_train)
y_pred = gridridge.predict(X_test_scaled)
print("ridge best parameters:", gridridge.best_params_)
print("ridge score:", gridridge.score(X_test_scaled, y_test))
print("ridge MSE:", mean_squared_error(y_test, gridridge.predict(X_test_scaled)))
print("ridge best estimator coef:", gridridge.best_estimator_.coef_)
sns.regplot(y_test, y_pred)
# --- Lasso: same grid with a finer low end for alpha ---
parameters = {'alpha': np.concatenate((np.arange(0.01,2,0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
lasso = linear_model.Lasso()
gridlasso = GridSearchCV(lasso, parameters, scoring ='r2')
gridlasso.fit(X_train_scaled, y_train)
y_pred = gridlasso.predict(X_test_scaled)
print("lasso best parameters:", gridlasso.best_params_)
print("lasso score:", gridlasso.score(X_test_scaled, y_test))
print("lasso MSE:", mean_squared_error(y_test, gridlasso.predict(X_test_scaled)))
print("lasso best estimator coef:", gridlasso.best_estimator_.coef_)
sns.regplot(y_test, y_pred)
# Elastic Net: grid-search alpha with R^2 scoring.
# Fix: this cell was copy-pasted from the Lasso cell above — the fitted
# search was stored in `gridlasso` (clobbering the Lasso results) and every
# printout was labelled "lasso". Renamed to `griden` / "Elastic Net" to match
# the Elastic Net cell of the first model.
parameters = {'alpha': np.concatenate((np.arange(0.01,2,0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
en = linear_model.ElasticNet()
griden = GridSearchCV(en, parameters, scoring ='r2')
griden.fit(X_train_scaled, y_train)
y_pred = griden.predict(X_test_scaled)
print("Elastic Net best parameters:", griden.best_params_)
print("Elastic Net score:", griden.score(X_test_scaled, y_test))
print("Elastic Net MSE:", mean_squared_error(y_test, griden.predict(X_test_scaled)))
print("Elastic Net best estimator coef:", griden.best_estimator_.coef_)
sns.regplot(y_test, y_pred)
# Decision tree and random forest for Model 4 (unscaled features).
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
criterion =['mse']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTR = DecisionTreeRegressor(random_state=10)
gridrf = GridSearchCV(DTR, parameters, scoring ='r2')
gridrf.fit(X_train, y_train)
y_pred = gridrf.predict(X_test)
print("DTR best parameters:", gridrf.best_params_)
print("DTR score:", gridrf.score(X_test, y_test))
print("DTR MSE:", mean_squared_error(y_test, gridrf.predict(X_test)))
sns.regplot(y_test, y_pred )
# --- random forest with 5-fold CV ---
RFEstimatorList = [25,50,100,200]
criterion =['mse']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFR = RandomForestRegressor(random_state=10)
gridrf = GridSearchCV(RFR, parameters, cv=5, verbose=0,scoring ='r2')
gridrf.fit(X_train, y_train)
y_pred = gridrf.predict(X_test)
print("RFR best parameters:", gridrf.best_params_)
print("RFR score:", gridrf.score(X_test, y_test))
print("RFR MSE:", mean_squared_error(y_test, gridrf.predict(X_test)))
sns.regplot(y_test, y_pred )
# Model 5a: restrict the data to movies released in 2009 or later, then
# repeat the regression pipeline.
years = movie_df.query("title_year >= 2009")
data5 = years[['duration','budget','Biography','Comedy','Crime','Drama','Romance',
'Mystery_Thriller_Horror','Sci-Fi_Fantasy','Family_Animation','Action_Adventure',
'History_War','Others','director_rank',
'actor1_rank','actor2_rank','actor3_rank'
,'movie_facebook_likes','cast_total_facebook_likes','facenumber_in_poster']]
y = years['imdb_score']
X_train, X_test, y_train, y_test = train_test_split(data5, y, test_size=0.30)
MM = MinMaxScaler(feature_range=(0, 1), copy=True)
X_train_MM = MM.fit_transform(X_train) # min-max scale X_train (scaler fitted on train only)
X_test_MM = MM.transform(X_test) # apply the train-fitted scaling to X_test
X_train_scaled = pd.DataFrame(data=X_train_MM, columns=X_train.columns)
X_test_scaled = pd.DataFrame(data=X_test_MM, columns=X_test.columns)
# --- plain linear regression baseline ---
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
y_pred= lr.predict(X_test_scaled)
print("linear regression MSE:", mean_squared_error(y_test, y_pred, multioutput='raw_values'))
print("linear regression R2:", r2_score(y_test, y_pred))
sns.regplot(y_test, y_pred)
# --- Ridge: grid-search alpha with R^2 scoring ---
parameters = {'alpha': np.concatenate((np.arange(0.1,2,0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
ridge = linear_model.Ridge()
gridridge = GridSearchCV(ridge, parameters, scoring ='r2')
gridridge.fit(X_train_scaled, y_train)
y_pred = gridridge.predict(X_test_scaled)
print("ridge best parameters:", gridridge.best_params_)
print("ridge score:", gridridge.score(X_test_scaled, y_test))
print("ridge MSE:", mean_squared_error(y_test, gridridge.predict(X_test_scaled)))
print("ridge best estimator coef:", gridridge.best_estimator_.coef_)
sns.regplot(y_test, y_pred)
# --- Lasso: same grid with a finer low end for alpha ---
parameters = {'alpha': np.concatenate((np.arange(0.01,2,0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
lasso = linear_model.Lasso()
gridlasso = GridSearchCV(lasso, parameters, scoring ='r2')
gridlasso.fit(X_train_scaled, y_train)
y_pred = gridlasso.predict(X_test_scaled)
print("lasso best parameters:", gridlasso.best_params_)
print("lasso score:", gridlasso.score(X_test_scaled, y_test))
print("lasso MSE:", mean_squared_error(y_test, gridlasso.predict(X_test_scaled)))
print("lasso best estimator coef:", gridlasso.best_estimator_.coef_)
sns.regplot(y_test, y_pred)
# Elastic Net: grid-search alpha with R^2 scoring.
# Fix: this cell was copy-pasted from the Lasso cell above — the fitted
# search was stored in `gridlasso` (clobbering the Lasso results) and every
# printout was labelled "lasso". Renamed to `griden` / "Elastic Net" to match
# the Elastic Net cell of the first model.
parameters = {'alpha': np.concatenate((np.arange(0.01,2,0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
en = linear_model.ElasticNet()
griden = GridSearchCV(en, parameters, scoring ='r2')
griden.fit(X_train_scaled, y_train)
y_pred = griden.predict(X_test_scaled)
print("Elastic Net best parameters:", griden.best_params_)
print("Elastic Net score:", griden.score(X_test_scaled, y_test))
print("Elastic Net MSE:", mean_squared_error(y_test, griden.predict(X_test_scaled)))
print("Elastic Net best estimator coef:", griden.best_estimator_.coef_)
sns.regplot(y_test, y_pred)
# Decision tree and random forest for Model 5a, then a feature-importance
# plot from a fixed-hyperparameter random forest.
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
criterion =['mse']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTR = DecisionTreeRegressor(random_state=10)
gridrf = GridSearchCV(DTR, parameters, scoring ='r2')
gridrf.fit(X_train, y_train)
y_pred = gridrf.predict(X_test)
print("DTR best parameters:", gridrf.best_params_)
print("DTR score:", gridrf.score(X_test, y_test))
print("DTR MSE:", mean_squared_error(y_test, gridrf.predict(X_test)))
sns.regplot(y_test, y_pred )
# --- random forest with 5-fold CV ---
RFEstimatorList = [25,50,100,200]
criterion =['mse']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFR = RandomForestRegressor(random_state=10)
gridrf = GridSearchCV(RFR, parameters, cv=5, verbose=0,scoring ='r2')
gridrf.fit(X_train, y_train)
y_pred = gridrf.predict(X_test)
print("RFR best parameters:", gridrf.best_params_)
print("RFR score:", gridrf.score(X_test, y_test))
print("RFR MSE:", mean_squared_error(y_test, gridrf.predict(X_test)))
sns.regplot(y_test, y_pred)
# Refit a forest with fixed hyperparameters and rank the features by
# importance; error bars are the std of importances across the trees.
clf3=RandomForestRegressor(criterion= 'mse', max_depth= 10, n_estimators= 100,random_state=10)
clf3.fit(X_train, y_train)
features = X_train.columns
importances = clf3.feature_importances_
std = np.std([tree.feature_importances_ for tree in clf3.estimators_],
axis=0)
indices = np.argsort(importances)[::-1]  # feature indices, most important first
# Print the feature ranking
print("Feature ranking:")
for f in range(X_train.shape[1]):
print("%d. feature %d (%f)" % (f + 1, indices[f], importances[indices[f]]))
# Plot the feature importances of the forest
plt.figure()
plt.title("Feature importances")
plt.bar(range(X_train.shape[1]), importances[indices],
color="b", yerr=std[indices], align="center")
plt.xticks(range(len(indices)), [features[i] for i in indices])
plt.xticks(rotation=90)
plt.xlim([-1, X_train.shape[1]])
plt.show()
The MSE is lower than in the model with likes, but the R2 is also lower.
# Model 5b: drop the Facebook-likes features entirely and repeat the pipeline.
data5 = movie_df[['duration','budget','Biography','Comedy','Crime','Drama','Romance',
'Mystery_Thriller_Horror','Sci-Fi_Fantasy','Family_Animation','Action_Adventure',
'History_War','Others','director_rank',
'actor1_rank','actor2_rank','actor3_rank'
,'facenumber_in_poster']]
y = movie_df['imdb_score']
X_train, X_test, y_train, y_test = train_test_split(data5, y, test_size=0.30)
MM = MinMaxScaler(feature_range=(0, 1), copy=True)
X_train_MM = MM.fit_transform(X_train) # min-max scale X_train (scaler fitted on train only)
X_test_MM = MM.transform(X_test) # apply the train-fitted scaling to X_test
X_train_scaled = pd.DataFrame(data=X_train_MM, columns=X_train.columns)
X_test_scaled = pd.DataFrame(data=X_test_MM, columns=X_test.columns)
# --- plain linear regression baseline ---
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
y_pred= lr.predict(X_test_scaled)
print("linear regression R2:", r2_score(y_test, y_pred))
print("linear regression MSE:", mean_squared_error(y_test, y_pred, multioutput='raw_values'))
sns.regplot(y_test, y_pred)
# --- Ridge: grid-search alpha with R^2 scoring ---
parameters = {'alpha': np.concatenate((np.arange(0.1,2,0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
ridge = linear_model.Ridge()
gridridge = GridSearchCV(ridge, parameters, scoring ='r2')
gridridge.fit(X_train_scaled, y_train)
y_pred = gridridge.predict(X_test_scaled)
print("ridge best parameters:", gridridge.best_params_)
print("ridge score:", gridridge.score(X_test_scaled, y_test))
print("ridge MSE:", mean_squared_error(y_test, gridridge.predict(X_test_scaled)))
print("ridge best estimator coef:", gridridge.best_estimator_.coef_)
sns.regplot(y_test, y_pred)
# --- Lasso: same grid with a finer low end for alpha ---
parameters = {'alpha': np.concatenate((np.arange(0.01,2,0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
lasso = linear_model.Lasso()
gridlasso = GridSearchCV(lasso, parameters, scoring ='r2')
gridlasso.fit(X_train_scaled, y_train)
y_pred = gridlasso.predict(X_test_scaled)
print("lasso best parameters:", gridlasso.best_params_)
print("lasso score:", gridlasso.score(X_test_scaled, y_test))
print("lasso MSE:", mean_squared_error(y_test, gridlasso.predict(X_test_scaled)))
print("lasso best estimator coef:", gridlasso.best_estimator_.coef_)
sns.regplot(y_test, y_pred)
# Elastic Net for the no-likes feature set (grid-searched alpha, R^2 scoring).
# Fix: the original first line was `en = lasso = linear_model.ElasticNet()`,
# which also rebound the `lasso` name from the previous cell to an ElasticNet
# instance for no reason; only `en` is bound now.
parameters = {'alpha': np.concatenate((np.arange(0.01,2,0.1), np.arange(2, 5, 0.5), np.arange(5, 25, 1)))}
en = linear_model.ElasticNet()
griden = GridSearchCV(en, parameters, scoring ='r2')
griden.fit(X_train_scaled, y_train)
y_pred = griden.predict(X_test_scaled)
print("Elastic Net best parameters:", griden.best_params_)
print("Elastic Net score:", griden.score(X_test_scaled, y_test))
print("Elastic Net MSE:", mean_squared_error(y_test, griden.predict(X_test_scaled)))
print("Elastic Net best estimator coef:", griden.best_estimator_.coef_)
sns.regplot(y_test, y_pred)
# Decision tree and random forest for the no-likes feature set.
DTSplitList=[0.001,0.01,0.05,0.1,0.5,0.99]
DTLeafList=[0.0005,0.005,0.025,0.05,0.1,0.25,0.5]
criterion =['mse']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-3)),'min_samples_leaf': DTLeafList, 'min_samples_split':DTSplitList}
DTR = DecisionTreeRegressor(random_state=10)
gridrf = GridSearchCV(DTR, parameters, scoring ='r2')
gridrf.fit(X_train, y_train)
y_pred = gridrf.predict(X_test)
print("DTR best parameters:", gridrf.best_params_)
print("DTR score:", gridrf.score(X_test, y_test))
print("DTR MSE:", mean_squared_error(y_test, gridrf.predict(X_test)))
sns.regplot(y_test, y_pred )
# --- random forest with 5-fold CV ---
RFEstimatorList = [25,50,100,200]
criterion =['mse']
parameters = {'criterion':criterion,'max_depth':list(range(100,0,-10)),'n_estimators':RFEstimatorList}
RFR = RandomForestRegressor(random_state=10)
gridrf = GridSearchCV(RFR, parameters, cv=5, verbose=0,scoring ='r2')
gridrf.fit(X_train, y_train)
y_pred = gridrf.predict(X_test)
print("RFR best parameters:", gridrf.best_params_)
print("RFR score:", gridrf.score(X_test, y_test))
print("RFR MSE:", mean_squared_error(y_test, gridrf.predict(X_test)))
sns.regplot(y_test, y_pred )
# Neural-network section: reload the median-imputed regression dataset,
# one-hot encode content_rating (dropping the rare categories), and build two
# feature sets — one targeting imdb_score, one targeting gross_budget_ratio.
df = pd.read_csv("data/data_regression_median.csv", index_col=0)
#df = df[df.language == "English"]
df = pd.get_dummies(df, columns=['content_rating'])
# drop rare/legacy rating dummies; PG, PG-13 and R are kept below
df = df.drop(columns =['content_rating_Not Rated','content_rating_G','content_rating_X','content_rating_NC-17',
'content_rating_TV-14', 'content_rating_TV-G', 'content_rating_TV-PG'])
# features + target column (imdb_score) for the score-prediction model
features_imdb = ["duration", "budget", *[f"{i}_rank" for i in ("director", "actor1", "actor2", "actor3")],
"movie_facebook_likes", "cast_total_facebook_likes", "Biography", "Comedy", "Crime", "Drama",
"Romance", "Mystery_Thriller_Horror", "Sci-Fi_Fantasy", "Family_Animation", "Action_Adventure",
"History_War", "Others", "facenumber_in_poster", "imdb_score",
'content_rating_PG', 'content_rating_PG-13', 'content_rating_R'
]
df_imdb = df[features_imdb]
df_imdb.columns
# features + target column (gross_budget_ratio) for the profitability model;
# also includes the raw per-person Facebook-likes columns
features_prof = ["duration", "budget",
*[f"{i}_facebook_likes" for i in ("director", "actor_1", "actor_2", "actor_3")],
*[f"{i}_rank" for i in ("director", "actor1", "actor2", "actor3")],
"movie_facebook_likes", "cast_total_facebook_likes", "Biography", "Comedy", "Crime", "Drama",
"Romance", "Mystery_Thriller_Horror", "Sci-Fi_Fantasy", "Family_Animation", "Action_Adventure",
"History_War", "Others", "facenumber_in_poster", "gross_budget_ratio",
'content_rating_PG', 'content_rating_PG-13', 'content_rating_R'
]
df_prof = df[features_prof]
df_imdb.columns
df_imdb.shape, df_prof.shape
def norm(x, x_stats):
    """Standardize x using the 'mean' and 'std' entries of x_stats.

    x_stats is expected to index like the output of
    df.describe().transpose() (columns 'mean' and 'std').
    """
    mu = x_stats["mean"]
    sigma = x_stats["std"]
    return (x - mu) / sigma
# Standardize the imdb feature matrix and split train/test; then eyeball each
# feature against the target with scatter plots.
X = df_imdb.loc[:, ~df_imdb.columns.isin(["imdb_score"])]
X_stats = X.describe().transpose()
normed_X = norm(X, X_stats)  # NOTE: stats computed on the full X, before the split
y1 = df_imdb.imdb_score
X_train, X_test, y_train, y_test = train_test_split(normed_X, y1, test_size=0.33, random_state=42)
# train_dataset = tf.data.Dataset.from_tensor_slices((np.array(X_train), np.array(y2_train)))
# test_dataset = tf.data.Dataset.from_tensor_slices((np.array(X_test), np.array(y2_test)))
normed_X.head()
X_train.shape, X_test.shape
# one scatter plot per feature against imdb_score
for x in normed_X.columns:
sns.scatterplot(list(normed_X[x]), np.array(y1).reshape(y1.shape[0],))
plt.show()
# NOTE: THE LONG OUTPUT (500 EPOCHS) OF THIS CELL HAS BEEN DELETED
# Small dense network regressing imdb_score: one 32-unit SELU layer plus a
# linear output, trained with Adam on MSE.
model = tfk.Sequential([
#tfk.Input(shape=(25,)),
#tfkl.BatchNormalization(), # normalise data here
# tfkl.Dense(64, activation='selu', input_shape=[X_train.shape[1]]),
tfkl.Dense(32, activation='selu'),
tfkl.Dense(1)
])
model.compile(optimizer=tf.optimizers.Adam(1e-3), loss=tfk.losses.mean_squared_error, metrics=["mse", "mae"])
model.fit(np.array(X_train), np.array(y_train), validation_split=0.2, batch_size=100,
epochs=700, workers=10, use_multiprocessing=True)
# predicted vs real scores on the held-out test set
preds = model.predict(np.array(X_test)).flatten()
plt.scatter(preds, np.array(y_test).flatten(), alpha=0.25)
plt.ylabel("real imdb_score")
plt.xlabel("predicted imdb_score")
plt.title("Scatter Plot between real and predicted IMDB Scores", size=12)
plt.show()
errors = preds - np.array(y_test).flatten()
sns.distplot(errors) # We expect the distribution to be normal around zero!
plt.title("Distribution of errors of predicted IMDB Scores", size=12)
plt.xlabel("error")
plt.ylabel("density")
plt.show()
# summary test metrics: MSE, MAE, explained variance, R^2
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score, r2_score
print(
mean_squared_error(np.array(y_test), preds),
mean_absolute_error(np.array(y_test), preds),
explained_variance_score(np.array(y_test), preds),
r2_score(np.array(y_test), preds)
)
# Profitability target: inspect the gross/budget ratio distribution, trim
# extreme ratios (>= 5), standardize the remaining features and build the
# log-ratio label y2.
sns.distplot(df_prof["gross_budget_ratio"])
sns.distplot(np.log(df_prof["gross_budget_ratio"]))
plt.title("Distribution of the log-transformed gross-budget ratios", size=12)
plt.ylabel("density")
plt.show()
df_cut = df_prof[df_prof.gross_budget_ratio < 5]  # drop extreme-outlier ratios
X_cut = df_cut.loc[:, ~df_cut.columns.isin(["gross_budget_ratio"])]
X_cut.shape
# not_include = ["imdb_score", "profitability", "num_voted_users", "title_year", "num_critic_for_reviews",
# "num_user_for_reviews"]
X_cut_stats = X_cut.describe().transpose()
normed_X_cut = norm(X_cut, X_cut_stats).reset_index(drop=True)
y2 = np.log(df_prof[df_prof["gross_budget_ratio"] < 5].gross_budget_ratio)
sns.distplot(y2)
# X remains the same, but labels are now y2
X_train, X_test, y_train, y_test = train_test_split(normed_X_cut, y2, test_size=0.33, random_state=42)
# NOTE: THE LONG OUTPUT (500 EPOCHS) OF THIS CELL HAS BEEN DELETED
# Same small dense architecture, now regressing log(gross/budget ratio).
model2 = tfk.Sequential([
#tfk.Input(shape=(25,)),
#tfkl.BatchNormalization(), # normalise data here
# tfkl.Dense(128, activation='selu', input_shape=[X_train.shape[1]]),
# tfkl.Dense(64, activation='selu'),
tfkl.Dense(32, activation='selu'),
tfkl.Dense(1)
])
model2.compile(optimizer=tf.optimizers.Adam(1e-3), loss=tfk.losses.mean_squared_error, metrics=["mse", "mae"])
model2.fit(np.array(X_train), np.array(y_train), validation_split=0.2, batch_size=128,
epochs=1000, workers=10, use_multiprocessing=True)
# predicted vs real log-ratios on the held-out test set
preds2 = model2.predict(X_test).flatten()
plt.scatter(preds2, np.array(y_test).flatten(), alpha=0.25)
plt.ylabel("real log(ratio)")
plt.xlabel("predicted log(ratio)")
plt.title("Scatter Plot between real and predicted log(Gross-Budget ratio)", size=12)
plt.show()
errors2 = preds2 - np.array(y_test).flatten()
sns.distplot(errors2) # We expect the distribution to be normal around zero!
# summary test metrics: MSE, MAE, explained variance, R^2
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score, r2_score
print(
mean_squared_error(np.array(y_test), preds2),
mean_absolute_error(np.array(y_test), preds2),
explained_variance_score(np.array(y_test), preds2),
r2_score(np.array(y_test), preds2)
)
# Probabilistic variant: a Bayesian DenseFlipout layer feeding a Normal
# output distribution (tensorflow-probability), trained on the same
# log-ratio task.
from tensorflow_probability import layers as tfpl
from tensorflow_probability import distributions as tfd
X_train, X_test, y_train, y_test = train_test_split(normed_X_cut, y2, test_size=0.33, random_state=42)
# NOTE: THE LONG OUTPUT (500 EPOCHS) OF THIS CELL HAS BEEN DELETED
model3 = tfk.Sequential([
tfpl.DenseFlipout(32, activation='sigmoid', input_shape=[X_train.shape[1]]),
tfkl.Dense(1, activation="sigmoid"),
# output layer: Normal centered on the dense output, with a small
# softplus-scaled standard deviation
tfpl.DistributionLambda(make_distribution_fn=lambda t: tfd.Normal(
loc=t, scale=1e-3 + tf.math.softplus(0.02 * t),
))
])
# NOTE(review): KLDivergence as the loss for a Normal-distribution output is
# unusual — a negative log-likelihood loss is the common choice here; confirm
# this was intentional.
model3.compile(optimizer=tf.optimizers.Adam(1e-3), loss=tfk.losses.KLDivergence(), metrics=[])
model3.fit(np.array(X_train), np.array(y_train), validation_split=0.2, batch_size=150,
epochs=500, workers=10, use_multiprocessing=True)
preds3 = model3.predict(X_test).flatten()
plt.scatter(preds3, np.array(y_test).flatten(), alpha=0.25)
errors3 = preds3 - np.array(y_test).flatten()
sns.distplot(errors3)
# summary test metrics: MSE, MAE, explained variance, R^2
from sklearn.metrics import mean_squared_error, mean_absolute_error, explained_variance_score, r2_score
print(
mean_squared_error(np.array(y_test), preds3),
mean_absolute_error(np.array(y_test), preds3),
explained_variance_score(np.array(y_test), preds3),
r2_score(np.array(y_test), preds3)
)
sns.boxplot(df_prof["gross_budget_ratio"]) # to show the ridiculous range of data
plt.title("Box plot of gross-budget ratio")
plt.show()
def classify_ratio(ratio):
    """Binary profitability label: 0 when the gross/budget ratio is below 1
    (the movie lost money), 1 otherwise."""
    return 0 if ratio < 1 else 1
# Binary profitability classification: label each movie 0/1 via
# classify_ratio, split, one-hot encode the labels, and train a small dense
# classifier with dropout and L2 regularization.
df_cut["profitability_class"] = df_cut["gross_budget_ratio"].apply(lambda x : classify_ratio(x))
y3 = df_cut["profitability_class"].reset_index(drop=True)
#y3_encoded = tfk.utils.to_categorical(y3)
X_train, X_test, y_train, y_test = train_test_split(normed_X_cut, y3, test_size=0.4, random_state=42)
sns.boxplot(df_cut["gross_budget_ratio"])
plt.title("Box plot of gross-budget ratio after eliminating outliers")
y3[y3==1].shape, normed_X_cut[y3==1].shape
sns.countplot(y3)
plt.title("Count plot of binary gross-budget ratio categorisation")
plt.show()
# one-hot encode the 0/1 labels for the softmax output
y_train = tfk.utils.to_categorical(y_train)
y_test = tfk.utils.to_categorical(y_test)
# NOTE: THE LONG OUTPUT (500 EPOCHS) OF THIS CELL HAS BEEN DELETED
num_classes = 2
model4 = tfk.Sequential([
tfkl.Dense(32, activation="sigmoid",
kernel_regularizer=tfk.regularizers.l2(1e-4)),
tfkl.Dropout(0.2),
tfkl.Dense(num_classes, activation="softmax")
])
model4.compile(optimizer=tfk.optimizers.Adam(1e-2),
loss=[tf.keras.losses.categorical_crossentropy],
metrics=['accuracy', tfk.metrics.Recall()])
model4.fit(X_train.values, y_train, validation_split=0.33,
epochs=500, batch_size=128, use_multiprocessing=True, workers=10)
model4.evaluate(np.array(X_train), y_train, verbose=2)
model4.evaluate(np.array(X_test), y_test, verbose=2, batch_size=128)
preds4 = model4.predict(np.array(X_test))
preds4 # each row holds the predicted probabilities for the 2 classes
preds4_concrete = np.argmax(preds4, axis=1)  # pick the most probable class
preds4_concrete
from sklearn.metrics import confusion_matrix, classification_report
# classification_report's signature is (y_true, y_pred); the original call
# passed the predictions first, which transposes precision and recall in the
# report. Ground truth (argmax of the one-hot y_test) now comes first.
print(classification_report(np.argmax(y_test, 1), preds4_concrete))
def classify_ratio_3c(ratio):
    """Three-way profitability label for a gross/budget ratio.

    Returns 0 when ratio < 1 (loss), 1 when 1 <= ratio < 2 (modest profit),
    2 when ratio >= 2 (high profit).

    Fix: the original middle branch was `(ratio > 1) and (ratio < 2)`, so a
    break-even ratio of exactly 1 fell through to the `else` and was labelled
    class 2 ("high profit"); the binary classify_ratio above labels ratio == 1
    as profitable, so class 1 is the consistent choice here.
    """
    if ratio < 1:
        return 0
    elif ratio < 2:
        return 1
    else:
        return 2
# Three-class profitability labels, then oversample classes 1 and 2 (with
# replacement) up to the size of class 0 to rebalance the training set.
df_cut["profitability_class"] = df_cut["gross_budget_ratio"].apply(lambda x : classify_ratio_3c(x))
y3 = df_cut["profitability_class"].reset_index(drop=True)
#y3_encoded = tfk.utils.to_categorical(y3)
X_train, X_test, y_train, y_test = train_test_split(normed_X_cut, y3, test_size=0.4, random_state=42)
# NOTE(review): to_categorical here makes y_train a 2-D one-hot array, so the
# `y_train == k` masks below are 2-D and no longer select rows; the resampling
# block looks like it expects the integer labels straight from
# train_test_split — confirm the intended cell-execution order.
y_train = tfk.utils.to_categorical(y_train)
y_test = tfk.utils.to_categorical(y_test)
sns.countplot(y3)
plt.title("Count plot of 3-class gross-budget ratio categorisation")
plt.show()
# split the training rows by class; class 0 is the majority class
class_0_features = X_train[y_train == 0]
class_0_labels = y_train[y_train == 0]
class_1_features = X_train[y_train == 1].values
class_2_features = X_train[y_train == 2].values
class_1_labels = y_train[y_train == 1].values
class_2_labels = y_train[y_train == 2].values
# oversample class 1 (with replacement) up to the class-0 count
ids = np.arange(len(class_1_features))
choices = np.random.choice(ids, class_0_features.shape[0])
res_class_1_features = class_1_features[choices]
res_class_1_labels = class_1_labels[choices]
# oversample class 2 the same way
ids = np.arange(len(class_2_features))
choices = np.random.choice(ids, class_0_features.shape[0])
res_class_2_features = class_2_features[choices]
res_class_2_labels = class_2_labels[choices]
# stack the balanced classes and shuffle rows jointly
resampled_X = np.concatenate([class_0_features.values, res_class_1_features, res_class_2_features])
resampled_y = np.concatenate([class_0_labels, res_class_1_labels, res_class_2_labels])
order = np.arange(len(resampled_y))
np.random.shuffle(order)
resampled_X = resampled_X[order]
resampled_y = resampled_y[order]
resampled_y_encoded = tfk.utils.to_categorical(resampled_y)
sns.countplot(resampled_y)
plt.title("Count plot of binary gross-budget ratio categorisation after rebalancing")
plt.show()
# (A class-weight alternative to oversampling was prototyped here and removed;
# the explicit rebalancing above makes per-class loss weights unnecessary.)
# 3-class classifier trained on the rebalanced set: one L2-regularised sigmoid
# hidden layer, dropout, 3-way softmax head.
num_classes = 3
model5 = tfk.Sequential([
    tfkl.Dense(32, activation="sigmoid",
               kernel_regularizer=tfk.regularizers.l2(1e-4)
               ),
    tfkl.Dropout(0.3),
    tfkl.Dense(num_classes, activation="softmax")
])
# Single-output model: pass the loss directly rather than wrapping it in a list.
model5.compile(optimizer=tfk.optimizers.Adam(1e-3),
               loss=tf.keras.losses.categorical_crossentropy,
               metrics=['accuracy', tfk.metrics.Recall()])
model5.fit(np.array(resampled_X), resampled_y_encoded, validation_split=0.33,
           epochs=500, batch_size=128, use_multiprocessing=True, workers=10)
# Evaluate on the original (unbalanced) train and test splits.
model5.evaluate(np.array(X_train), y_train, verbose=2)
model5.evaluate(np.array(X_test), y_test, verbose=2, batch_size=128)
preds5 = model5.predict(np.array(X_test))
preds5_concrete = np.argmax(preds5, axis=1)
# BUG FIX: classification_report expects (y_true, y_pred); the original passed
# the predictions first, which swaps precision and recall in the report.
print(classification_report(np.argmax(y_test, axis=1), preds5_concrete))
# Baseline 3-class classifier trained on the ORIGINAL (unbalanced) training
# split, for comparison with the rebalanced model5: one sigmoid hidden layer
# (no regularisation), 3-way softmax head.
num_classes = 3
model6 = tfk.Sequential([
    tfkl.Dense(32, activation="sigmoid"),
    tfkl.Dense(num_classes, activation="softmax")
])
# Single-output model: pass the loss directly rather than wrapping it in a list.
model6.compile(optimizer=tfk.optimizers.Adam(1e-3),
               loss=tf.keras.losses.categorical_crossentropy,
               metrics=['accuracy', tfk.metrics.Recall()])
model6.fit(np.array(X_train), y_train, validation_split=0.33,
           epochs=500, batch_size=128, use_multiprocessing=True, workers=10)
model6.evaluate(np.array(X_train), y_train, verbose=2)
model6.evaluate(np.array(X_test), y_test, verbose=2, batch_size=128)
preds6 = model6.predict(np.array(X_test))
preds6_concrete = np.argmax(preds6, axis=1)
# BUG FIX: classification_report expects (y_true, y_pred); the original passed
# the predictions first, which swaps precision and recall in the report.
print(classification_report(np.argmax(y_test, axis=1), preds6_concrete))